HTML5lib Sanitizer

Replaced native Sanitizer with HTML5lib version.
Synced with latest Maruku.
This commit is contained in:
Jacques Distler 2007-05-25 20:52:27 -05:00
parent 457ec8627c
commit 6b21ac484f
36 changed files with 6534 additions and 215 deletions

View file

@ -294,13 +294,13 @@ class WikiController < ApplicationController
def s5 def s5
if @web.markup == :markdownMML if @web.markup == :markdownMML
@s5_content = sanitize_html(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"), @s5_content = sanitize_xhtml(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}'], :content_only => true, {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}'], :content_only => true,
:author => @page.author, :title => @page.plain_name}).to_s5).to_ncr :author => @page.author, :title => @page.plain_name}).to_s5.to_ncr)
elsif @web.markup == :markdown elsif @web.markup == :markdown
@s5_content = sanitize_html(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"), @s5_content = sanitize_xhtml(Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
{:math_enabled => false, :content_only => true, {:math_enabled => false, :content_only => true,
:author => @page.author, :title => @page.plain_name}).to_s5).to_ncr :author => @page.author, :title => @page.plain_name}).to_s5.to_ncr)
else else
@s5_content = "S5 not supported with this text filter" @s5_content = "S5 not supported with this text filter"
end end

207
attic/lib/sanitize.rb Normal file
View file

@ -0,0 +1,207 @@
module Sanitize
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes.
#
# Based heavily on Sam Ruby's code in the Universal FeedParser.
require 'html/tokenizer'
require 'node'
# --- Whitelist tables -------------------------------------------------------
# Everything below is a whitelist: anything not named here is escaped or
# stripped by sanitize_html / sanitize_css.

# XHTML elements that are passed through unescaped.
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
  'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
  'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
  'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
  'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
  'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
  'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
  'ul', 'var']

# MathML elements that are passed through unescaped.
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
  'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
  'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
  'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
  'munderover', 'none']

# SVG elements that are passed through unescaped.
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
  'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
  'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
  'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
  'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
  'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']

# XHTML attributes that are kept on allowed elements.
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
  'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
  'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
  'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
  'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
  'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
  'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
  'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
  'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
  'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target', 'title',
  'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang']

# MathML attributes that are kept on allowed elements (each listed once).
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnlines',
  'columnspacing', 'columnspan', 'depth', 'display', 'displaystyle',
  'equalcolumns', 'equalrows', 'fence', 'fontstyle', 'fontweight',
  'frame', 'height', 'linethickness', 'lspace', 'mathbackground',
  'mathcolor', 'mathvariant', 'maxsize', 'minsize', 'other', 'rowalign',
  'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel',
  'selection', 'separator', 'stretchy', 'width', 'xlink:href',
  'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']

# SVG attributes that are kept on allowed elements.
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
  'arabic-form', 'ascent', 'attributeName', 'attributeType',
  'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
  'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
  'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
  'font-family', 'font-size', 'font-stretch', 'font-style', 'font-variant',
  'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
  'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
  'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
  'lang', 'marker-end', 'marker-mid', 'marker-start', 'markerHeight',
  'markerUnits', 'markerWidth', 'mathematical', 'max', 'min', 'name',
  'offset', 'opacity', 'orient', 'origin', 'overline-position',
  'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
  'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', 'repeatDur',
  'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx',
  'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity',
  'strikethrough-position', 'strikethrough-thickness', 'stroke',
  'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
  'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
  'stroke-width', 'systemLanguage', 'target',
  'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
  'underline-position', 'underline-thickness', 'unicode',
  'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
  'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
  'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
  'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
  'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']

# Attributes whose value is a URI and therefore gets its scheme checked.
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'xlink:href']

# CSS properties whose values are passed through without keyword checks.
acceptable_css_properties = ['azimuth', 'background-color',
  'border-bottom-color', 'border-collapse', 'border-color',
  'border-left-color', 'border-right-color', 'border-top-color', 'clear',
  'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
  'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
  'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
  'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
  'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
  'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
  'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
  'white-space', 'width']

# Keywords accepted as values of background-*/border-*/margin-*/padding-*
# properties that are not themselves in acceptable_css_properties.
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
  'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
  'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
  'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
  'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
  'transparent', 'underline', 'white', 'yellow']

# SVG presentation properties accepted inside a style attribute.
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
  'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
  'stroke-opacity']

# URI schemes accepted in the attributes listed in attr_val_is_uri.
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
  'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
  'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
  'ssh', 'sftp', 'rtsp', 'afs' ]

# Public constants. Each one is only assigned when it is not already defined,
# so an application can supply its own list before this file is loaded.
# The combined element/attribute lists are de-duplicated because the XHTML,
# MathML and SVG vocabularies overlap (e.g. 'a', 'width', 'height').
ALLOWED_ELEMENTS = (acceptable_elements + mathml_elements + svg_elements).uniq unless defined?(ALLOWED_ELEMENTS)
ALLOWED_ATTRIBUTES = (acceptable_attributes + mathml_attributes + svg_attributes).uniq unless defined?(ALLOWED_ATTRIBUTES)
ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES)
ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS)
ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES)
ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS)
ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI)
# Whitelist-based sanitizer for a string of markup.
#
# The input is split into tokens; each token is handled as follows:
# * text, and tags whose name is not in ALLOWED_ELEMENTS, are kept but have
#   every "<" turned into "&lt;" so they render as literal text;
# * allowed opening tags lose every attribute not in ALLOWED_ATTRIBUTES;
# * attributes named in ATTR_VAL_IS_URI are dropped when their value carries a
#   URI scheme that is not in ALLOWED_PROTOCOLS;
# * a surviving style attribute is rewritten by sanitize_css.
# The whitelists can be replaced by defining the constant arrays before this
# Module is loaded.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_html(html)
  # Input without any "<" contains no markup and is returned untouched.
  return html unless html.index("<")

  tokenizer = HTML::Tokenizer.new(html)
  sanitized = ""
  while token = tokenizer.next
    node = XHTML::Node.parse(nil, 0, 0, token, false)

    # Anything that is not a whitelisted tag is neutralised and emitted as text.
    unless node.tag? == true && ALLOWED_ELEMENTS.include?(node.name)
      sanitized << node.to_s.gsub(/</, "&lt;")
      next
    end

    # Closing tags carry no attributes, so only opening tags need filtering.
    if node.closing != :close
      node.attributes.delete_if { |name, _value| !ALLOWED_ATTRIBUTES.include?(name) }

      ATTR_VAL_IS_URI.each do |uri_attr|
        # Decode entities and strip whitespace/control bytes before looking at
        # the scheme, so obfuscated "java&#x09;script:" values are recognised.
        decoded = CGI.unescapeHTML(node.attributes[uri_attr].to_s)
        compacted = decoded.gsub(/[\000-\040\177\s]+|\302*[\200-\240]/,'').downcase
        next unless compacted =~ /^[a-z0-9][-+.a-z0-9]*:/
        next if ALLOWED_PROTOCOLS.include?(compacted.split(':')[0])
        node.attributes.delete uri_attr
      end

      inline_style = node.attributes['style']
      node.attributes['style'] = sanitize_css(inline_style) if inline_style
    end

    sanitized << node.to_s
  end
  sanitized
end
# Reduce the value of an inline style attribute to a whitelisted subset.
#
# +style+ is converted with to_s, so nil is treated as an empty string.
# Returns a String of "property: value;" declarations joined by single
# spaces, or '' when the input fails either validation pass.
#
# A declaration is kept when its property is in ALLOWED_CSS_PROPERTIES or
# ALLOWED_SVG_PROPERTIES, or when it is a background-*/border-*/margin-*/
# padding-* property whose value consists only of ALLOWED_CSS_KEYWORDS,
# colours and simple lengths.
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # Gauntlet: the WHOLE value may only contain harmless characters.
  # \A and \Z anchor to the entire string; the line anchors ^ and $ would be
  # satisfied by a single clean line and let the other lines through unchecked.
  return '' if style !~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\Z/
  # The whole value must also be a well-formed list of "name: value" pairs.
  return '' if style !~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|\Z))*\Z/

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
    name = prop.downcase
    if ALLOWED_CSS_PROPERTIES.include?(name)
      clean << "#{prop}: #{val};"
    elsif ['background', 'border', 'margin', 'padding'].include?(name.split('-')[0])
      # Shorthand families: every whitespace-separated token of the value must
      # be a known keyword, a hex/rgb colour, or a short number with a unit.
      harmless = val.split.all? do |keyword|
        ALLOWED_CSS_KEYWORDS.include?(keyword) ||
          keyword =~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      end
      clean << "#{prop}: #{val};" if harmless
    elsif ALLOWED_SVG_PROPERTIES.include?(name)
      clean << "#{prop}: #{val};"
    end
  end
  clean.join(' ')
end
end

View file

@ -0,0 +1,187 @@
#!/usr/bin/env ruby
require File.expand_path(File.join(File.dirname(__FILE__), '/../test_helper'))
require 'sanitize'
class SanitizeTest < Test::Unit::TestCase
include Sanitize
def setup
end
# Generates one test per whitelisted element: the element and its title
# attribute must survive (re-quoted with double quotes), while the unknown
# nested <bad> element must come out with its "<" escaped.
Sanitize::ALLOWED_ELEMENTS.each do |tag_name|
define_method "test_should_allow_#{tag_name}_tag" do
assert_equal "<#{tag_name} title=\"1\">foo &lt;bad>bar&lt;/bad> baz</#{tag_name}>",
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
end
end
# Generates one test per whitelisted element spelled in upper case: the
# whitelist match is case-sensitive, so the upper-cased tag must be escaped.
Sanitize::ALLOWED_ELEMENTS.each do |tag_name|
define_method "test_should_forbid_#{tag_name.upcase}_tag" do
assert_equal "&lt;#{tag_name.upcase} title=\"1\">foo &lt;bad>bar&lt;/bad> baz&lt;/#{tag_name.upcase}>",
sanitize_html("<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>")
end
end
# Generates one test per whitelisted attribute (on a <p>): the attribute must
# be kept. 'style' is skipped here because its value is rewritten rather than
# passed through verbatim.
Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name|
if attribute_name != 'style'
define_method "test_should_allow_#{attribute_name}_attribute" do
assert_equal "<p #{attribute_name}=\"foo\">foo &lt;bad>bar&lt;/bad> baz</p>",
sanitize_html("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
end
end
end
# Generates one test per whitelisted attribute spelled in upper case: the
# upper-cased attribute must be stripped from the <p>.
Sanitize::ALLOWED_ATTRIBUTES.each do |attribute_name|
define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
assert_equal "<p>foo &lt;bad>bar&lt;/bad> baz</p>",
sanitize_html("<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>")
end
end
# Generates one test per whitelisted URI scheme. The href is a real URI using
# that scheme (scheme followed by ':'), so the protocol check in sanitize_html
# is actually exercised; a bare word such as "http" has no scheme at all and
# would pass regardless of the whitelist.
Sanitize::ALLOWED_PROTOCOLS.each do |protocol|
  define_method "test_should_allow_#{protocol}_uris" do
    uri = "#{protocol}://sub.domain.tld/path/object.ext"
    assert_equal %(<a href="#{uri}">foo</a>),
      sanitize_html(%(<a href="#{uri}">foo</a>))
  end
end
# Same as above with the scheme in upper case: scheme matching is done on a
# lower-cased copy of the value, and the attribute itself is emitted unchanged.
Sanitize::ALLOWED_PROTOCOLS.each do |protocol|
  define_method "test_should_allow_uppercase_#{protocol}_uris" do
    uri = "#{protocol.upcase}://sub.domain.tld/path/object.ext"
    assert_equal %(<a href="#{uri}">foo</a>),
      sanitize_html(%(<a href="#{uri}">foo</a>))
  end
end
# A plain relative href is kept; the onclick handler is stripped and the
# nested <script> element is escaped.
def test_should_allow_anchors
assert_equal "<a href=\"foo\">&lt;script>baz&lt;/script></a>",
sanitize_html("<a href='foo' onclick='bar'><script>baz</script></a>")
end
# RFC 3986, sec 4.2
# A colon inside a relative path must not be mistaken for a URI scheme.
def test_allow_colons_in_path_component
assert_equal "<a href=\"./this:that\">foo</a>",
sanitize_html("<a href=\"./this:that\">foo</a>")
end
# Generates one test per common <img> attribute: it is kept while the
# onclick handler next to it is removed.
%w(src width height alt).each do |img_attr|
define_method "test_should_allow_image_#{img_attr}_attribute" do
assert_equal "<img #{img_attr}=\"foo\" />",
sanitize_html("<img #{img_attr}='foo' onclick='bar' />")
end
end
# Text without any markup is returned unchanged.
def test_should_handle_non_html
assert_equal 'abc', sanitize_html("abc")
end
# The empty string is returned unchanged.
def test_should_handle_blank_text
assert_equal '', sanitize_html('')
end
# A javascript: URI in img/src or a/href removes that attribute only; the
# neighbouring title attribute stays.
[%w(img src), %w(a href)].each do |(tag, attr)|
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do
assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo</#{tag}>))
end
end
# Same as above, with a leading space in front of the scheme.
[%w(img src), %w(a href)].each do |(tag, attr)|
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do
assert_equal %(<#{tag} title="1">boo</#{tag}>), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo</#{tag}>))
end
end
# Table of <img> payloads that each try to smuggle a javascript: URI into the
# src attribute: unquoted value, mixed case, &quot; entities, decimal and
# hexadecimal character references (with and without padding or trailing
# semicolons), and tab/newline/control/space characters placed inside or in
# front of the scheme. One test is generated per payload (numbered by its
# index in the table); in every case the src attribute must be removed,
# leaving a bare "<img />".
[%(<img src="javascript:alert('XSS');" />),
%(<img src=javascript:alert('XSS') />),
%(<img src="JaVaScRiPt:alert('XSS')" />),
%(<img src='javascript:alert(&quot;XSS&quot;)' />),
%(<img src='javascript:alert(String.fromCharCode(88,83,83))' />),
%(<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />),
%(<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041' />),
%(<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29' />),
%(<img src="jav\tascript:alert('XSS');" />),
%(<img src="jav&#x09;ascript:alert('XSS');" />),
%(<img src="jav&#x0A;ascript:alert('XSS');" />),
%(<img src="jav&#x0D;ascript:alert('XSS');" />),
%(<img src=" &#14; javascript:alert('XSS');" />),
%(<img src="&#x20;javascript:alert('XSS');" />),
%(<img src="&#xA0;javascript:alert('XSS');" />)].each_with_index do |img_hack, i|
define_method "test_should_not_fall_for_xss_image_hack_#{i}" do
assert_equal "<img />", sanitize_html(img_hack)
end
end
# A NUL byte inside the tag name must not let <script> through: the tag is
# expected to come out escaped and truncated at the NUL.
def test_should_sanitize_tag_broken_up_by_null
assert_equal "&lt;scr>alert(\"XSS\")&lt;/scr>", sanitize_html(%(<scr\0ipt>alert(\"XSS\")</scr\0ipt>))
end
# "<script/XSS ...>" (slash used as a separator) must still be escaped.
def test_should_sanitize_invalid_script_tag
assert_equal "&lt;script />&lt;/script>", sanitize_html(%(<script/XSS SRC="http://ha.ckers.org/xss.js"></script>))
end
# Doubled "<" characters and an unterminated <iframe> must all be escaped.
def test_should_sanitize_script_tag_with_multiple_open_brackets
assert_equal "&lt;&lt;script>alert(\"XSS\");//&lt;&lt;/script>", sanitize_html(%(<<script>alert("XSS");//<</script>))
assert_equal %(&lt;iframe src="http:" />&lt;), sanitize_html(%(<iframe src=http://ha.ckers.org/scriptlet.html\n<))
end
# A <script> tag that is never closed is escaped; the following <b> is kept.
def test_should_sanitize_unclosed_script
assert_equal "&lt;script src=\"http:\" /><b>", sanitize_html(%(<script src=http://ha.ckers.org/xss.js?<b>))
end
# An <img> missing its closing ">" still has the javascript: src removed.
def test_should_sanitize_half_open_scripts
assert_equal "<img>", sanitize_html(%(<img src="javascript:alert('XSS')"))
end
# A javascript: URI with a newline between every character is still caught.
def test_should_not_fall_for_ridiculous_hack
img_hack = %(<img\nsrc\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n />)
assert_equal "<img />", sanitize_html(img_hack)
end
# Full-page overlay link: positioning properties and the url(...) image are
# removed from the style attribute while the harmless declarations are kept.
def test_platypus
assert_equal %(<a href=\"http://www.ragingplatypus.com/\" style=\"display: block; width: 100%; height: 100%; background-color: black; background-image: ; background-x: center; background-y: center;\">never trust your upstream platypus</a>),
sanitize_html(%(<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>))
end
# A -moz-binding style is reduced to an empty style attribute.
def test_xul
assert_equal %(<p style="">fubar</p>),
sanitize_html(%(<p style="-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')">fubar</p>))
end
# <input type="image"> keeps its type but loses the javascript: src.
def test_input_image
assert_equal %(<input type="image" />),
sanitize_html(%(<input type="image" src="javascript:alert('XSS');" />))
end
# Punctuation used in place of whitespace between tag name, attribute name
# and value must not smuggle a script, an event handler or a src through.
def test_non_alpha_non_digit
assert_equal "&lt;script />&lt;/script>",
sanitize_html(%(<script/XSS src="http://ha.ckers.org/xss.js"></script>))
assert_equal "<a>foo</a>",
sanitize_html('<a onclick!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>foo</a>')
assert_equal "<img />",
sanitize_html('<img/src="http://ha.ckers.org/xss.js"/>')
end
# The non-standard dynsrc and lowsrc attributes are not whitelisted and are
# removed.
def test_img_dynsrc_lowsrc
assert_equal "<img />",
sanitize_html(%(<img dynsrc="javascript:alert('XSS')" />))
assert_equal "<img />",
sanitize_html(%(<img lowsrc="javascript:alert('XSS')" />))
end
# A background-image written with CSS backslash escapes is rejected, leaving
# an empty style attribute.
def test_div_background_image_unicode_encoded
assert_equal '<div style="">foo</div>',
sanitize_html(%(<div style="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">foo</div>))
end
# A CSS expression(...) value is rejected, leaving an empty style attribute.
def test_div_expression
assert_equal '<div style="">foo</div>',
sanitize_html(%(<div style="width: expression(alert('XSS'));">foo</div>))
end
# A vbscript: URI is treated like any other non-whitelisted scheme.
def test_img_vbscript
assert_equal '<img />',
sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
end
end

View file

@ -32,7 +32,7 @@ module Engines
redcloth.filter_html = false redcloth.filter_html = false
redcloth.no_span_caps = false redcloth.no_span_caps = false
html = redcloth.to_html(:textile) html = redcloth.to_html(:textile)
sanitize_html(html) sanitize_xhtml(html)
end end
end end
@ -43,7 +43,7 @@ module Engines
require_dependency 'maruku' require_dependency 'maruku'
require_dependency 'maruku/ext/math' require_dependency 'maruku/ext/math'
html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"), {:math_enabled => false}).to_html html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"), {:math_enabled => false}).to_html
sanitize_html(html).to_ncr sanitize_xhtml(html.to_ncr)
end end
end end
@ -55,7 +55,7 @@ module Engines
require_dependency 'maruku/ext/math' require_dependency 'maruku/ext/math'
html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"), html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html
sanitize_html(html).to_ncr sanitize_xhtml(html.to_ncr)
end end
end end
@ -68,7 +68,7 @@ module Engines
redcloth.filter_html = false redcloth.filter_html = false
redcloth.no_span_caps = false redcloth.no_span_caps = false
html = redcloth.to_html html = redcloth.to_html
sanitize_html(html) sanitize_xhtml(html)
end end
end end
@ -78,7 +78,7 @@ module Engines
def mask def mask
require_dependency 'rdocsupport' require_dependency 'rdocsupport'
html = RDocSupport::RDocFormatter.new(@content).to_html html = RDocSupport::RDocFormatter.new(@content).to_html
sanitize_html(html) sanitize_xhtml(html)
end end
end end

View file

@ -3,205 +3,24 @@ module Sanitize
# This module provides sanitization of XHTML+MathML+SVG # This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes. # and of inline style attributes.
# #
# Based heavily on Sam Ruby's code in the Universal FeedParser. # Uses the HTML5lib parser, so that the parsing behaviour should
# resemble that of browsers.
require 'html/tokenizer' #
require 'node' # sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML
# sanitize_html() is a case-insensitive sanitizer suitable for HTML
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
'ul', 'var']
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
'munderover', 'none']
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target', 'title',
'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang']
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign', require 'html5lib/sanitizer'
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth', require 'html5lib/html5parser'
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence', require 'html5lib/liberalxmlparser'
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace', include HTML5lib
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
'xlink:type', 'xmlns', 'xmlns:xlink']
def sanitize_xhtml(html)
XHTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
end
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
'font-family', 'font-size', 'font-stretch', 'font-style', 'font-variant',
'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
'lang', 'marker-end', 'marker-mid', 'marker-start', 'markerHeight',
'markerUnits', 'markerWidth', 'mathematical', 'max', 'min', 'name',
'offset', 'opacity', 'orient', 'origin', 'overline-position',
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', 'repeatDur',
'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx',
'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity',
'strikethrough-position', 'strikethrough-thickness', 'stroke',
'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
'stroke-width', 'systemLanguage', 'target',
'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
'underline-position', 'underline-thickness', 'unicode',
'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'xlink:href']
acceptable_css_properties = ['azimuth', 'background-color',
'border-bottom-color', 'border-collapse', 'border-color',
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
'white-space', 'width']
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
'transparent', 'underline', 'white', 'yellow']
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
'stroke-opacity']
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
'ssh', 'sftp', 'rtsp', 'afs' ]
ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements unless defined?(ALLOWED_ELEMENTS)
ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES)
ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES)
ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS)
ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES)
ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS)
ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI)
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all
# attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set,
# specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified in
# ALLOWED_PROTOCOLS are allowed.
# You can adjust what gets sanitized, by defining these constant arrays before this Module is loaded.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_html(html) def sanitize_html(html)
if html.index("<") HTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s
tokenizer = HTML::Tokenizer.new(html)
new_text = ""
while token = tokenizer.next
node = XHTML::Node.parse(nil, 0, 0, token, false)
new_text << case node.tag?
when true
if ALLOWED_ELEMENTS.include?(node.name)
if node.closing != :close
node.attributes.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
ATTR_VAL_IS_URI.each do |attr|
val_unescaped = CGI.unescapeHTML(node.attributes[attr].to_s).gsub(/[\000-\040\177-\240]+/,'').downcase
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
node.attributes.delete attr
end
end
if node.attributes['style']
node.attributes['style'] = sanitize_css(node.attributes['style'])
end
end
node.to_s
else
node.to_s.gsub(/</, "&lt;")
end
else
node.to_s.gsub(/</, "&lt;")
end
end end
html = new_text
end
html
end
def sanitize_css(style)
# disallow urls
style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
# gauntlet
if style !~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
style = ''
return style
end
if style !~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
style = ''
return style
end
clean = []
style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
if ALLOWED_CSS_PROPERTIES.include?(prop.downcase)
clean << prop + ': ' + val + ';'
elsif ['background','border','margin','padding'].include?(prop.split('-')[0].downcase)
goodval = true
val.split().each do |keyword|
if !ALLOWED_CSS_KEYWORDS.include?(keyword) and
keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
goodval = false
end
end
if goodval
clean << prop + ': ' + val + ';'
end
elsif ALLOWED_SVG_PROPERTIES.include?(prop.downcase)
clean << prop + ': ' + val + ';'
end
end
style = clean.join(' ')
end
end end

9
vendor/plugins/HTML5lib/README vendored Normal file
View file

@ -0,0 +1,9 @@
= HTML5lib
== Basic Usage
require 'html5lib'
doc = HTML5lib.parse('<html>...</html>')
doc.class # REXML::Document

7
vendor/plugins/HTML5lib/Rakefile.rb vendored Normal file
View file

@ -0,0 +1,7 @@
require 'rake'
require 'rake/testtask'
# Defines the test task (default name: "test"). It picks up every file
# matching tests/test_*.rb and runs with verbose output.
Rake::TestTask.new do |t|
  t.verbose = true
  t.pattern = 'tests/test_*.rb'
end

11
vendor/plugins/HTML5lib/lib/html5lib.rb vendored Normal file
View file

@ -0,0 +1,11 @@
require 'html5lib/html5parser'
# Convenience entry points that forward to HTML5lib::HTMLParser.
module HTML5lib
  # Parse +stream+ as a complete document.
  # +options+ is passed to HTMLParser.parse unchanged.
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  # Parse +stream+ as a document fragment.
  # +options+ is passed to HTMLParser.parseFragment unchanged. (This used to
  # call HTMLParser.parse, so fragments were parsed as full documents.)
  def self.parseFragment(stream, options={})
    HTMLParser.parseFragment(stream, options)
  end
end

View file

@ -0,0 +1,676 @@
module HTML5lib
class EOF < Exception; end
CONTENT_MODEL_FLAGS = [
:PCDATA,
:RCDATA,
:CDATA,
:PLAINTEXT
]
SCOPING_ELEMENTS = %w[
button
caption
html
marquee
object
table
td
th
]
FORMATTING_ELEMENTS = %w[
a
b
big
em
font
i
nobr
s
small
strike
strong
tt
u
]
SPECIAL_ELEMENTS = %w[
address
area
base
basefont
bgsound
blockquote
body
br
center
col
colgroup
dd
dir
div
dl
dt
embed
fieldset
form
frame
frameset
h1
h2
h3
h4
h5
h6
head
hr
iframe
image
img
input
isindex
li
link
listing
menu
meta
noembed
noframes
noscript
ol
optgroup
option
p
param
plaintext
pre
script
select
spacer
style
tbody
textarea
tfoot
thead
title
tr
ul
wbr
]
SPACE_CHARACTERS = %W[
\t
\n
\x0B
\x0C
\x20
\r
]
TABLE_INSERT_MODE_ELEMENTS = %w[
table
tbody
tfoot
thead
tr
]
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
DIGITS = '0'..'9'
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
# Heading elements need to be ordered
HEADING_ELEMENTS = %w[
h1
h2
h3
h4
h5
h6
]
# XXX What about event-source and command?
VOID_ELEMENTS = %w[
base
link
meta
hr
br
img
embed
param
area
col
input
]
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN
65533, # 0x81 UNDEFINED
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
8224, # 0x86 0x2020 DAGGER
8225, # 0x87 0x2021 DOUBLE DAGGER
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
8240, # 0x89 0x2030 PER MILLE SIGN
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
65533, # 0x8D UNDEFINED
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
65533, # 0x8F UNDEFINED
65533, # 0x90 UNDEFINED
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
8226, # 0x95 0x2022 BULLET
8211, # 0x96 0x2013 EN DASH
8212, # 0x97 0x2014 EM DASH
732, # 0x98 0x02DC SMALL TILDE
8482, # 0x99 0x2122 TRADE MARK SIGN
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
65533, # 0x9D UNDEFINED
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
]
private
# Returns the UTF-8 encoded String for the Unicode code point +n+.
#
# NOTE(review): the surrounding private/public pair does not change the
# visibility of this method, because it is defined on the singleton with
# "def self."; it stays publicly callable.
def self.U(n)
  codepoints = [n]
  return codepoints.pack('U')
end
public
ENTITIES = {
"AElig" => U(0xC6),
"Aacute" => U(0xC1),
"Acirc" => U(0xC2),
"Agrave" => U(0xC0),
"Alpha" => U(0x0391),
"Aring" => U(0xC5),
"Atilde" => U(0xC3),
"Auml" => U(0xC4),
"Beta" => U(0x0392),
"Ccedil" => U(0xC7),
"Chi" => U(0x03A7),
"Dagger" => U(0x2021),
"Delta" => U(0x0394),
"ETH" => U(0xD0),
"Eacute" => U(0xC9),
"Ecirc" => U(0xCA),
"Egrave" => U(0xC8),
"Epsilon" => U(0x0395),
"Eta" => U(0x0397),
"Euml" => U(0xCB),
"Gamma" => U(0x0393),
"Iacute" => U(0xCD),
"Icirc" => U(0xCE),
"Igrave" => U(0xCC),
"Iota" => U(0x0399),
"Iuml" => U(0xCF),
"Kappa" => U(0x039A),
"Lambda" => U(0x039B),
"Mu" => U(0x039C),
"Ntilde" => U(0xD1),
"Nu" => U(0x039D),
"OElig" => U(0x0152),
"Oacute" => U(0xD3),
"Ocirc" => U(0xD4),
"Ograve" => U(0xD2),
"Omega" => U(0x03A9),
"Omicron" => U(0x039F),
"Oslash" => U(0xD8),
"Otilde" => U(0xD5),
"Ouml" => U(0xD6),
"Phi" => U(0x03A6),
"Pi" => U(0x03A0),
"Prime" => U(0x2033),
"Psi" => U(0x03A8),
"Rho" => U(0x03A1),
"Scaron" => U(0x0160),
"Sigma" => U(0x03A3),
"THORN" => U(0xDE),
"Tau" => U(0x03A4),
"Theta" => U(0x0398),
"Uacute" => U(0xDA),
"Ucirc" => U(0xDB),
"Ugrave" => U(0xD9),
"Upsilon" => U(0x03A5),
"Uuml" => U(0xDC),
"Xi" => U(0x039E),
"Yacute" => U(0xDD),
"Yuml" => U(0x0178),
"Zeta" => U(0x0396),
"aacute" => U(0xE1),
"acirc" => U(0xE2),
"acute" => U(0xB4),
"aelig" => U(0xE6),
"agrave" => U(0xE0),
"alefsym" => U(0x2135),
"alpha" => U(0x03B1),
"amp" => U(0x26),
"AMP" => U(0x26),
"and" => U(0x2227),
"ang" => U(0x2220),
"apos" => U(0x27),
"aring" => U(0xE5),
"asymp" => U(0x2248),
"atilde" => U(0xE3),
"auml" => U(0xE4),
"bdquo" => U(0x201E),
"beta" => U(0x03B2),
"brvbar" => U(0xA6),
"bull" => U(0x2022),
"cap" => U(0x2229),
"ccedil" => U(0xE7),
"cedil" => U(0xB8),
"cent" => U(0xA2),
"chi" => U(0x03C7),
"circ" => U(0x02C6),
"clubs" => U(0x2663),
"cong" => U(0x2245),
"copy" => U(0xA9),
"COPY" => U(0xA9),
"crarr" => U(0x21B5),
"cup" => U(0x222A),
"curren" => U(0xA4),
"dArr" => U(0x21D3),
"dagger" => U(0x2020),
"darr" => U(0x2193),
"deg" => U(0xB0),
"delta" => U(0x03B4),
"diams" => U(0x2666),
"divide" => U(0xF7),
"eacute" => U(0xE9),
"ecirc" => U(0xEA),
"egrave" => U(0xE8),
"empty" => U(0x2205),
"emsp" => U(0x2003),
"ensp" => U(0x2002),
"epsilon" => U(0x03B5),
"equiv" => U(0x2261),
"eta" => U(0x03B7),
"eth" => U(0xF0),
"euml" => U(0xEB),
"euro" => U(0x20AC),
"exist" => U(0x2203),
"fnof" => U(0x0192),
"forall" => U(0x2200),
"frac12" => U(0xBD),
"frac14" => U(0xBC),
"frac34" => U(0xBE),
"frasl" => U(0x2044),
"gamma" => U(0x03B3),
"ge" => U(0x2265),
"gt" => U(0x3E),
"GT" => U(0x3E),
"hArr" => U(0x21D4),
"harr" => U(0x2194),
"hearts" => U(0x2665),
"hellip" => U(0x2026),
"iacute" => U(0xED),
"icirc" => U(0xEE),
"iexcl" => U(0xA1),
"igrave" => U(0xEC),
"image" => U(0x2111),
"infin" => U(0x221E),
"int" => U(0x222B),
"iota" => U(0x03B9),
"iquest" => U(0xBF),
"isin" => U(0x2208),
"iuml" => U(0xEF),
"kappa" => U(0x03BA),
"lArr" => U(0x21D0),
"lambda" => U(0x03BB),
"lang" => U(0x2329),
"laquo" => U(0xAB),
"larr" => U(0x2190),
"lceil" => U(0x2308),
"ldquo" => U(0x201C),
"le" => U(0x2264),
"lfloor" => U(0x230A),
"lowast" => U(0x2217),
"loz" => U(0x25CA),
"lrm" => U(0x200E),
"lsaquo" => U(0x2039),
"lsquo" => U(0x2018),
"lt" => U(0x3C),
"LT" => U(0x3C),
"macr" => U(0xAF),
"mdash" => U(0x2014),
"micro" => U(0xB5),
"middot" => U(0xB7),
"minus" => U(0x2212),
"mu" => U(0x03BC),
"nabla" => U(0x2207),
"nbsp" => U(0xA0),
"ndash" => U(0x2013),
"ne" => U(0x2260),
"ni" => U(0x220B),
"not" => U(0xAC),
"notin" => U(0x2209),
"nsub" => U(0x2284),
"ntilde" => U(0xF1),
"nu" => U(0x03BD),
"oacute" => U(0xF3),
"ocirc" => U(0xF4),
"oelig" => U(0x0153),
"ograve" => U(0xF2),
"oline" => U(0x203E),
"omega" => U(0x03C9),
"omicron" => U(0x03BF),
"oplus" => U(0x2295),
"or" => U(0x2228),
"ordf" => U(0xAA),
"ordm" => U(0xBA),
"oslash" => U(0xF8),
"otilde" => U(0xF5),
"otimes" => U(0x2297),
"ouml" => U(0xF6),
"para" => U(0xB6),
"part" => U(0x2202),
"permil" => U(0x2030),
"perp" => U(0x22A5),
"phi" => U(0x03C6),
"pi" => U(0x03C0),
"piv" => U(0x03D6),
"plusmn" => U(0xB1),
"pound" => U(0xA3),
"prime" => U(0x2032),
"prod" => U(0x220F),
"prop" => U(0x221D),
"psi" => U(0x03C8),
"quot" => U(0x22),
"QUOT" => U(0x22),
"rArr" => U(0x21D2),
"radic" => U(0x221A),
"rang" => U(0x232A),
"raquo" => U(0xBB),
"rarr" => U(0x2192),
"rceil" => U(0x2309),
"rdquo" => U(0x201D),
"real" => U(0x211C),
"reg" => U(0xAE),
"REG" => U(0xAE),
"rfloor" => U(0x230B),
"rho" => U(0x03C1),
"rlm" => U(0x200F),
"rsaquo" => U(0x203A),
"rsquo" => U(0x2019),
"sbquo" => U(0x201A),
"scaron" => U(0x0161),
"sdot" => U(0x22C5),
"sect" => U(0xA7),
"shy" => U(0xAD),
"sigma" => U(0x03C3),
"sigmaf" => U(0x03C2),
"sim" => U(0x223C),
"spades" => U(0x2660),
"sub" => U(0x2282),
"sube" => U(0x2286),
"sum" => U(0x2211),
"sup" => U(0x2283),
"sup1" => U(0xB9),
"sup2" => U(0xB2),
"sup3" => U(0xB3),
"supe" => U(0x2287),
"szlig" => U(0xDF),
"tau" => U(0x03C4),
"there4" => U(0x2234),
"theta" => U(0x03B8),
"thetasym" => U(0x03D1),
"thinsp" => U(0x2009),
"thorn" => U(0xFE),
"tilde" => U(0x02DC),
"times" => U(0xD7),
"trade" => U(0x2122),
"uArr" => U(0x21D1),
"uacute" => U(0xFA),
"uarr" => U(0x2191),
"ucirc" => U(0xFB),
"ugrave" => U(0xF9),
"uml" => U(0xA8),
"upsih" => U(0x03D2),
"upsilon" => U(0x03C5),
"uuml" => U(0xFC),
"weierp" => U(0x2118),
"xi" => U(0x03BE),
"yacute" => U(0xFD),
"yen" => U(0xA5),
"yuml" => U(0xFF),
"zeta" => U(0x03B6),
"zwj" => U(0x200D),
"zwnj" => U(0x200C)
}
ENCODINGS = %w[
ansi_x3.4-1968
iso-ir-6
ansi_x3.4-1986
iso_646.irv:1991
ascii
iso646-us
us-ascii
us
ibm367
cp367
csascii
ks_c_5601-1987
korean
iso-2022-kr
csiso2022kr
euc-kr
iso-2022-jp
csiso2022jp
iso-2022-jp-2
iso-ir-58
chinese
csiso58gb231280
iso_8859-1:1987
iso-ir-100
iso_8859-1
iso-8859-1
latin1
l1
ibm819
cp819
csisolatin1
iso_8859-2:1987
iso-ir-101
iso_8859-2
iso-8859-2
latin2
l2
csisolatin2
iso_8859-3:1988
iso-ir-109
iso_8859-3
iso-8859-3
latin3
l3
csisolatin3
iso_8859-4:1988
iso-ir-110
iso_8859-4
iso-8859-4
latin4
l4
csisolatin4
iso_8859-6:1987
iso-ir-127
iso_8859-6
iso-8859-6
ecma-114
asmo-708
arabic
csisolatinarabic
iso_8859-7:1987
iso-ir-126
iso_8859-7
iso-8859-7
elot_928
ecma-118
greek
greek8
csisolatingreek
iso_8859-8:1988
iso-ir-138
iso_8859-8
iso-8859-8
hebrew
csisolatinhebrew
iso_8859-5:1988
iso-ir-144
iso_8859-5
iso-8859-5
cyrillic
csisolatincyrillic
iso_8859-9:1989
iso-ir-148
iso_8859-9
iso-8859-9
latin5
l5
csisolatin5
iso-8859-10
iso-ir-157
l6
iso_8859-10:1992
csisolatin6
latin6
hp-roman8
roman8
r8
ibm037
cp037
csibm037
ibm424
cp424
csibm424
ibm437
cp437
437
cspc8codepage437
ibm500
cp500
csibm500
ibm775
cp775
cspc775baltic
ibm850
cp850
850
cspc850multilingual
ibm852
cp852
852
cspcp852
ibm855
cp855
855
csibm855
ibm857
cp857
857
csibm857
ibm860
cp860
860
csibm860
ibm861
cp861
861
cp-is
csibm861
ibm862
cp862
862
cspc862latinhebrew
ibm863
cp863
863
csibm863
ibm864
cp864
csibm864
ibm865
cp865
865
csibm865
ibm866
cp866
866
csibm866
ibm869
cp869
869
cp-gr
csibm869
ibm1026
cp1026
csibm1026
koi8-r
cskoi8r
koi8-u
big5-hkscs
ptcp154
csptcp154
pt154
cp154
utf-7
utf-16be
utf-16le
utf-16
utf-8
iso-8859-13
iso-8859-14
iso-ir-199
iso_8859-14:1998
iso_8859-14
latin8
iso-celtic
l8
iso-8859-15
iso_8859-15
iso-8859-16
iso-ir-226
iso_8859-16:2001
iso_8859-16
latin10
l10
gbk
cp936
ms936
gb18030
shift_jis
ms_kanji
csshiftjis
euc-jp
gb2312
big5
csbig5
windows-1250
windows-1251
windows-1252
windows-1253
windows-1254
windows-1255
windows-1256
windows-1257
windows-1258
tis-620
hz-gb-2312
]
end

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,549 @@
require 'stringio'
require 'html5lib/constants'
module HTML5lib
# Provides a unicode stream of characters to the HTMLTokenizer.
# This class takes care of character encoding and removing or replacing
# incorrect byte-sequences and also provides column and line tracking.
class HTMLInputStream
attr_accessor :queue, :charEncoding
# Initialises the HTMLInputStream.
#
# HTMLInputStream.new(source, options) -> normalized stream from source
# for use by HTML5lib.
#
# source can be either an IO-like object (anything answering #read) or a
# string, which is wrapped in a StringIO.
#
# options is a Hash; every entry is copied into the instance variable of
# the same name. The ones read by this class are:
#
# encoding  - explicit "transport level" encoding name. If it is a valid
#             encoding it is used regardless of any BOM or later
#             declaration (such as in a meta element).
# parseMeta - look for a <meta> element containing encoding information
#             (default true).
# chardet   - fall back to the chardet gem when nothing else is found
#             (default true).
def initialize(source, options = {})
  @encoding = nil
  @parseMeta = true
  @chardet = true
  options.each { |name, value| instance_variable_set("@#{name}", value) }

  # List of where new lines occur
  @newLines = []

  # Raw Stream
  @rawStream = openStream(source)

  # Encoding Information
  # Number of bytes to use when looking for a meta element with
  # encoding information
  @NUM_BYTES_META = 512
  # Encoding to use if no other information can be found
  @DEFAULT_ENCODING = 'windows-1252'

  # Detect encoding iff no explicit "transport level" encoding is supplied
  if @encoding.nil? or not HTML5lib.isValidEncoding(@encoding)
    @charEncoding = detectEncoding
  else
    @charEncoding = @encoding
  end

  # Read bytes from stream decoding them into Unicode
  uString = @rawStream.read
  unless @charEncoding == 'utf-8'
    begin
      require 'iconv'
      # Convert from the encoding that was actually selected above.
      # @encoding is nil (or invalid) whenever detection was used, so
      # passing it here made every conversion fail and be silently skipped.
      uString = Iconv.iconv('utf-8', @charEncoding, uString)[0]
    rescue
      # Best effort: if the conversion fails the bytes are used unchanged.
    end
  end

  # Normalize newlines and null characters
  uString.gsub!(/\r\n?/, "\n")
  uString.gsub!("\x00", [0xFFFD].pack('U'))

  # The decoded, normalized text that #char and #charsUntil read from
  @dataStream = uString

  @queue = []

  # Reset position in the list to read from
  reset
end
# Produces an IO-like object from source.
#
# An object that already answers #read is used as it is; anything else is
# treated as a string and wrapped in a StringIO. The chosen object is
# remembered in @stream and returned.
def openStream(source)
  @stream = source.respond_to?(:read) ? source : StringIO.new(source)
  return @stream
end
# Works out the character encoding of @rawStream and returns its name.
#
# Sources are tried in this order, stopping at the first that yields a
# value: a byte order mark, a <meta> declaration (only when @parseMeta is
# set), the optional chardet gem (only when @chardet is set) and finally
# @DEFAULT_ENCODING. 'ascii' and 'iso-8859-1' are reported as
# 'windows-1252'.
def detectEncoding
#First look for a BOM
#This will also read past the BOM if present
encoding = detectBOM
#If there is no BOM need to look for meta elements with encoding
#information
if encoding.nil? and @parseMeta
encoding = detectEncodingMeta
end
#Guess with chardet, if available
if encoding.nil? and @chardet
begin
require 'rubygems'
require 'UniversalDetector' # gem install chardet
# Reading consumes the stream, so it is re-opened from the buffer afterwards
buffer = @rawStream.read
encoding = UniversalDetector::chardet(buffer)['encoding']
@rawStream = openStream(buffer)
# The gem is optional: when it cannot be loaded this step is skipped
rescue LoadError
end
end
# If all else fails use the default encoding
if encoding.nil?
encoding = @DEFAULT_ENCODING
end
#Substitute for equivalent encodings:
encodingSub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
if encodingSub.has_key?(encoding.downcase)
encoding = encodingSub[encoding.downcase]
end
return encoding
end
# Attempts to detect a BOM at the start of the stream. If an encoding can
# be determined from the BOM the name of that encoding is returned and the
# stream is left positioned just past the BOM; otherwise nil is returned
# and the stream is rewound to its start.
def detectBOM
  bomDict = {
    "\xef\xbb\xbf" => 'utf-8',
    "\xff\xfe" => 'utf-16-le',
    "\xfe\xff" => 'utf-16-be',
    "\xff\xfe\x00\x00" => 'utf-32-le',
    "\x00\x00\xfe\xff" => 'utf-32-be'
  }

  # Go to beginning of file and read in 4 bytes
  @rawStream.seek(0)
  head = @rawStream.read(4)
  return nil unless head

  # Prefixes to probe, paired with the BOM length they stand for. UTF-8 is
  # tried first; UTF-32 has to be tried before UTF-16 because the UTF-32LE
  # mark begins with the UTF-16LE mark.
  probes = [[head[0...3], 3], [head, 4], [head[0...2], 2]]
  encoding = nil
  bomLength = 0
  probes.each do |prefix, size|
    encoding = bomDict[prefix]
    bomLength = size
    break if encoding
  end

  #AT - move this to the caller?
  # Set the read position past the BOM if one was found, otherwise
  # set it to the start of the stream
  @rawStream.seek(encoding ? bomLength : 0)
  return encoding
end
# Report the encoding declared by a meta element, or nil if none is found.
#
# Only the next @NUM_BYTES_META bytes of the raw stream are handed to
# EncodingParser; the stream is rewound to its start afterwards.
def detectEncodingMeta
parser = EncodingParser.new(@rawStream.read(@NUM_BYTES_META))
@rawStream.seek(0)
return parser.getEncoding
end
# Looks through the stream to find where new lines occur so the position
# method can tell where it is: a leading 0 is appended to @newLines,
# followed by the index of every "\n" found in @dataStream.
def determineNewLines
  @newLines << 0
  offsets = 0...@dataStream.length
  offsets.each do |offset|
    next unless @dataStream[offset] == ?\n
    @newLines << offset
  end
end
# Returns [line, col] for the current read offset (@tell).
#
# line is the number of leading entries of @newLines that lie before the
# offset; col is the distance from the entry preceding that count.
def position
  # Generate list of new lines first time around
  determineNewLines if @newLines.empty?

  offset = @tell
  line = 0
  line += 1 while (line < @newLines.length and @newLines[line] < offset)
  return [line, offset - @newLines[line - 1] - 1]
end
# Resets the position in the stream back to the start by zeroing @tell,
# the read offset into @dataStream. @queue is left untouched.
def reset
@tell = 0
end
# Read one character from the queue if it holds anything, otherwise from
# the stream. Returns :EOF once the end of the stream has been reached.
#
# @tell is advanced on every read from the stream, including reads made at
# or beyond the end of the data.
def char
  return @queue.shift unless @queue.empty?

  @tell += 1
  byte = @dataStream[@tell - 1]
  # Test for the end of the data explicitly instead of rescuing whatever
  # nil.chr happens to raise, so that unrelated errors are not turned
  # into :EOF.
  return byte.nil? ? :EOF : byte.chr
end
# Returns a string of characters from the stream up to but not
# including any character in characters or EOF. characters can be
# any object that answers include? for a one-character string.
#
# With opposite set to true the test is inverted: characters are consumed
# for as long as they ARE in characters.
#
# The character (or :EOF) that ended the run is not lost; it is placed at
# the front of @queue so that the next read sees it again.
def charsUntil(characters, opposite = false)
charStack = [char]
unless charStack[0] == :EOF
while (characters.include? charStack[-1]) == opposite
unless @queue.empty?
# First from the queue
charStack.push(@queue.shift)
break if charStack[-1] == :EOF
else
# Then the rest
begin
charStack.push(@dataStream[@tell].chr)
@tell += 1
# Indexing past the end gives nil and nil.chr raises: treat that as EOF.
# @tell is not advanced in this case.
rescue
charStack.push(:EOF)
break
end
end
end
end
# Put the character stopped on back to the front of the queue
# from where it came.
@queue.insert(0, charStack.pop)
return charStack.join('')
end
end
# String-like object with an associated position and various extra methods.
# Asking for the current byte once the position has reached the end of the
# string raises EOF.
class EncodingBytes < String
  attr_accessor :position

  # The position starts one before the first byte so that the first step
  # of #each lands on index 0.
  def initialize(value)
    super(value)
    @position = -1
  end

  # Advances the position one byte at a time, yielding the byte at each new
  # position. The block may move the position itself; iteration carries on
  # from wherever it was left. An EOF raised inside the block ends the
  # iteration quietly.
  def each
    # Stop on the last real byte: stepping once more would yield nil for
    # the index one past the end.
    while @position < length - 1
      @position += 1
      yield self[@position]
    end
  rescue EOF
  end

  # Returns the byte at the current position as a one-character string.
  # Raises EOF when the position is at or beyond the end.
  def currentByte
    raise EOF if @position >= length
    return self[@position].chr
  end

  # Skip past a list of characters
  def skip(chars = SPACE_CHARACTERS)
    while chars.include?(currentByte)
      @position += 1
    end
  end

  # Look for a sequence of bytes at the current position. If the bytes
  # are found return true and advance the position to the byte after the
  # match. Otherwise return false and leave the position alone.
  # With lower set, the data is lower-cased before it is compared.
  def matchBytes(bytes, lower = false)
    data = self[position ... position+bytes.length]
    # A position beyond the end yields no slice at all: nothing can match.
    return false if data.nil?
    data.downcase! if lower
    rv = (data == bytes)
    @position += bytes.length if rv == true
    return rv
  end

  # Look for the next sequence of bytes matching a given sequence. If
  # a match is found advance the position to the last byte of the match
  # and return true; otherwise raise EOF.
  def jumpTo(bytes)
    rest = self[position .. -1]
    # rest is nil when the position is beyond the end; report that as EOF
    # like any other failed search.
    newPosition = rest && rest.index(bytes)
    raise EOF unless newPosition
    @position += (newPosition + bytes.length-1)
    return true
  end

  # Move the pointer so it points to the next byte in a set of possible
  # bytes. Raises EOF (via currentByte) if none is found.
  def findNext(byteList)
    until byteList.include?(currentByte)
      @position += 1
    end
  end
end
# Mini parser for detecting character encoding from meta elements
class EncodingParser
# string - the data to work on for encoding detection
def initialize(data)
@data = EncodingBytes.new(data.to_s)
@encoding = nil
end
# Byte prefixes recognised by getEncoding, each paired with the handler to
# call. The table is searched from the top and the first matching prefix
# wins, so longer prefixes must come before the shorter ones they start
# with ('<!--' before '<!', and '<meta', '</', '<!', '<?' before '<').
@@method_dispatch = [
['<!--', :handleComment],
['<meta', :handleMeta],
['</', :handlePossibleEndTag],
['<!', :handleOther],
['<?', :handleOther],
['<', :handlePossibleStartTag]
]
# Scans the data for an encoding declaration and returns the encoding name
# with surrounding whitespace stripped, or nil when none was found.
#
# At every position the dispatch table is tried in order; the handler of
# the first matching prefix is called and its return value decides whether
# scanning carries on (true) or stops (false).
def getEncoding
  @data.each do |byte|
    keepParsing = true
    @@method_dispatch.each do |(key, method)|
      # The second argument asks for a case-insensitive match. It is passed
      # positionally; writing "lower = true" here only assigned a stray local.
      if @data.matchBytes(key, true)
        keepParsing = send(method)
        break
      end
    end
    break unless keepParsing
  end
  @encoding = @encoding.strip unless @encoding.nil?
  return @encoding
end
# Skip over comments by jumping to the closing "-->".
# Returns true so that scanning continues; jumpTo raises EOF when no
# closing "-->" follows.
def handleComment
return @data.jumpTo('-->')
end
# Examines the attributes of a <meta element for an encoding.
#
# Returns false (stop scanning) once a valid encoding has been stored in
# @encoding, and true (keep scanning) when this turns out not to be a meta
# element or when its attributes run out without a valid encoding.
def handleMeta
  # "<meta" that is not followed by whitespace is some other tag
  return true unless SPACE_CHARACTERS.include?(@data.currentByte)

  # We have a valid meta element we want to search for attributes
  loop do
    # Try to find the next attribute after the current position
    pair = getAttribute
    return true if pair.nil?

    candidate = nil
    if pair[0] == 'charset'
      candidate = pair[1]
    elsif pair[0] == 'content'
      candidate = ContentAttrParser.new(EncodingBytes.new(pair[1])).parse
    else
      next
    end

    if HTML5lib.isValidEncoding(candidate)
      @encoding = candidate
      return false
    end
  end
end
# Handler for a "<" prefix: treat what follows as a possible start tag.
def handlePossibleStartTag
return handlePossibleTag(false)
end
# Handler for a "</" prefix: step one byte forward, then treat what follows
# as a possible end tag.
def handlePossibleEndTag
@data.position+=1
return handlePossibleTag(true)
end
# Skips over something that may be a tag. Always returns true, i.e.
# scanning continues afterwards.
#
# endTag - true when called for a "</" prefix, false for a plain "<".
def handlePossibleTag(endTag)
  if ASCII_LETTERS.include?(@data.currentByte)
    @data.findNext(SPACE_CHARACTERS + ['<', '>'])
    if @data.currentByte == '<'
      # return to the first step in the overall "two step" algorithm
      # reprocessing the < byte
      @data.position -= 1
    else
      # Read (and discard) all attributes
      nil until getAttribute.nil?
    end
  elsif endTag
    # The next byte is not an ascii letter: a possible end tag is treated
    # according to handleOther, a possible start tag is simply ignored
    @data.position -= 1
    handleOther
  end
  return true
end
# Skips to the next ">" and returns true so that scanning continues;
# jumpTo raises EOF when there is no ">" left.
def handleOther
return @data.jumpTo('>')
end
# Return a [name, value] pair for the next attribute in the stream,
# if one is found, or nil. ASCII upper case letters in both name and
# value are lower-cased. An attribute without a value gets ''.
def getAttribute
  @data.skip(SPACE_CHARACTERS + ['/'])
  if @data.currentByte == '<'
    # Step back so that the caller reprocesses the "<"
    @data.position -= 1
    return nil
  elsif @data.currentByte == '>'
    return nil
  end
  attrName = []
  attrValue = []
  spaceFound = false
  #Step 5 attribute name
  while true
    # An "=" ends the name only once at least one byte of the name has
    # been collected. The emptiness test has to be explicit: an empty
    # Array counts as true in Ruby.
    if @data.currentByte == '=' and not attrName.empty?
      break
    elsif SPACE_CHARACTERS.include?(@data.currentByte)
      spaceFound = true
      break
    elsif ['/', '<', '>'].include?(@data.currentByte)
      return [attrName.join(''), '']
    elsif ASCII_UPPERCASE.include?(@data.currentByte)
      attrName.push(@data.currentByte.downcase)
    else
      attrName.push(@data.currentByte)
    end
    #Step 6
    @data.position += 1
  end
  #Step 7
  if spaceFound
    @data.skip
    #Step 8
    unless @data.currentByte == '='
      @data.position -= 1
      return [attrName.join(''), '']
    end
  end
  #XXX need to advance position in both spaces and value case
  #Step 9
  @data.position += 1
  #Step 10
  @data.skip
  #Step 11
  if ["'", '"'].include?(@data.currentByte)
    #11.1
    quoteChar = @data.currentByte
    while true
      @data.position+=1
      #11.3
      if @data.currentByte == quoteChar
        @data.position += 1
        return [attrName.join(''), attrValue.join('')]
      #11.4
      elsif ASCII_UPPERCASE.include?(@data.currentByte)
        attrValue.push(@data.currentByte.downcase)
      #11.5
      else
        attrValue.push(@data.currentByte)
      end
    end
  elsif ['>', '<'].include?(@data.currentByte)
    return [attrName.join(''), '']
  elsif ASCII_UPPERCASE.include?(@data.currentByte)
    attrValue.push(@data.currentByte.downcase)
  else
    attrValue.push(@data.currentByte)
  end
  # Unquoted value: collect up to the next whitespace, "<" or ">"
  while true
    @data.position +=1
    if (SPACE_CHARACTERS + ['>', '<']).include?(@data.currentByte)
      return [attrName.join(''), attrValue.join('')]
    elsif ASCII_UPPERCASE.include?(@data.currentByte)
      attrValue.push(@data.currentByte.downcase)
    else
      attrValue.push(@data.currentByte)
    end
  end
end
end
# Extracts the charset parameter from the value of a meta element's
# content attribute (something like "text/html; charset=utf-8").
class ContentAttrParser
# data - the attribute value; it must offer position, jumpTo, skip,
# currentByte and findNext, as EncodingBytes does.
def initialize(data)
@data = data
end
# Returns the charset value found after the first ";", or nil when there
# is none. Any EOF raised while scanning is reported as nil, except for an
# unquoted value that runs to the end of the data, which is returned whole.
def parse
begin
#Skip to the first ";"
@data.position = 0
@data.jumpTo(';')
@data.position += 1
@data.skip
#Jump to the next occurrence of "charset"; jumpTo raises EOF (giving
#nil) when there is none
@data.jumpTo('charset')
@data.position += 1
@data.skip
unless @data.currentByte == '='
#If there is no = sign after "charset" give up
return nil
end
@data.position += 1
@data.skip
#Look for an encoding between matching quote marks
if ['"', "'"].include?(@data.currentByte)
quoteMark = @data.currentByte
@data.position += 1
oldPosition = @data.position
@data.jumpTo(quoteMark)
return @data[oldPosition ... @data.position]
else
#Unquoted value
oldPosition = @data.position
begin
@data.findNext(SPACE_CHARACTERS)
return @data[oldPosition ... @data.position]
rescue EOF
#Return the whole remaining value
return @data[oldPosition .. -1]
end
end
rescue EOF
return nil
end
end
end
# Determine if a string is a supported encoding.
#
# Returns true only for a String whose lower-cased, stripped form is listed
# in ENCODINGS; nil and non-String values give false.
def self.isValidEncoding(encoding)
  return false unless encoding.kind_of?(String)
  normalized = encoding.downcase.strip
  return ENCODINGS.include?(normalized)
end
end

View file

@ -0,0 +1,141 @@
# Warning: this module is experimental and subject to change and even removal
# at any time.
#
# For background/rationale, see:
# * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
# * http://tinyurl.com/ylfj8k (and follow-ups)
#
# References:
# * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
# * http://wiki.whatwg.org/wiki/HtmlVsXhtml
#
# @@TODO:
# * Selectively lowercase only XHTML, but not foreign markup
require 'html5lib/html5parser'
require 'html5lib/constants'
module HTML5lib
# liberal XML parser
class XMLParser < HTMLParser
# Sets the parser up through the superclass, then replaces the :initial
# phase with an XmlRootPhase.
def initialize(options={})
super options
@phases[:initial] = XmlRootPhase.new(self, @tree)
end
# Normalizes a token in place and returns it.
#
# * StartTag / EmptyTag: the attribute list becomes a Hash.
# * EmptyTag: the start tag is processed immediately and the token is
#   turned into the matching EndTag.
# * EndTag: attributes on an end tag are reported as a parse error.
# * Comment: a comment of the form [CDATA[...]] becomes a Characters
#   token holding the text between the brackets.
def normalizeToken(token)
  if token[:type] == :StartTag or token[:type] == :EmptyTag
    # We need to remove the duplicate attributes and convert attributes
    # to a Hash so that [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
    # The list is reversed so that the first occurrence wins.
    token[:data] = Hash[*token[:data].reverse.flatten]

    # For EmptyTags, process both a Start and an End tag
    if token[:type] == :EmptyTag
      @phase.processStartTag(token[:name], token[:data])
      token[:data] = {}
      token[:type] = :EndTag
    end

  elsif token[:type] == :EndTag
    # Only a non-empty attribute collection is an error. The emptiness test
    # has to be explicit: an empty Array or Hash counts as true in Ruby.
    if token[:data] and not token[:data].empty?
      parseError(_("End tag contains unexpected attributes."))
    end

  elsif token[:type] == :Comment
    # Rescue CDATA from the comments
    if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
      token[:type] = :Characters
      token[:data] = token[:data][7 ... -2]
    end
  end

  return token
end
end
# liberal XHTML parser
class XHTMLParser < XMLParser
# Sets the parser up through XMLParser, then installs InitialPhase as the
# :initial phase and XhmlRootPhase as the :rootElement phase.
def initialize(options={})
super options
@phases[:initial] = InitialPhase.new(self, @tree)
@phases[:rootElement] = XhmlRootPhase.new(self, @tree)
end
# Applies XMLParser#normalizeToken and then, for an end tag that closes the
# current (innermost open) non-void element while that element has no
# content, inserts an empty text node. The insertion is skipped when any
# open element declares an xmlns other than the XHTML namespace.
# Returns the token.
def normalizeToken(token)
super(token)
# ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted
if token[:type] == :EndTag and \
not VOID_ELEMENTS.include? token[:name] and \
token[:name] == @tree.openElements[-1].name and \
not @tree.openElements[-1].hasContent
@tree.insertText('') unless
@tree.openElements.any? {|e|
e.attributes.keys.include? 'xmlns' and
e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
}
end
return token
end
end
# Root element phase used by XHTMLParser.
class XhmlRootPhase < RootElementPhase
# Creates an html element carrying the XHTML namespace declaration, makes
# it the first open element and a child of the document, and moves the
# parser on to the :beforeHead phase.
def insertHtmlElement
element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
@tree.openElements.push(element)
@tree.document.appendChild(element)
@parser.phase = @parser.phases[:beforeHead]
end
end
# Phase that handles the root element of a generic XML document.
class XmlRootPhase < Phase
# Prime the Xml parser
# Both tables are Hashes with a default value, so every tag name maps to
# the same handler. NOTE(review): presumably consulted by Phase's tag
# dispatch, which is not shown here - confirm.
@start_tag_handlers = Hash.new(:startTagOther)
@end_tag_handlers = Hash.new(:endTagOther)
# Opens the root element: the document itself is pushed onto the stack of
# open elements first, the new element is appended to it and pushed as
# well, and the parser switches to a fresh XmlElementPhase.
def startTagOther(name, attributes)
@tree.openElements.push(@tree.document)
element = @tree.createElement(name, attributes)
@tree.openElements[-1].appendChild(element)
@tree.openElements.push(element)
@parser.phase = XmlElementPhase.new(@parser,@tree)
end
# Runs the inherited end tag handling, then pops one more entry off the
# stack of open elements.
def endTagOther(name)
super
@tree.openElements.pop
end
end
# Generic handling for all XML elements below the root.
class XmlElementPhase < Phase
  # Every tag name is routed to the same pair of handlers.
  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Creates the element, appends it to the innermost open element and
  # makes it the new innermost open element.
  def startTagOther(name, attributes)
    element = @tree.createElement(name, attributes)
    parent = @tree.openElements[-1]
    parent.appendChild(element)
    @tree.openElements.push(element)
  end

  # Walks the open elements from the innermost outwards. Each element with
  # a different name is reported as a parse error; at the first element
  # called +name+ the stack is popped up to and including that element.
  def endTagOther(name)
    @tree.openElements.reverse.each do |node|
      unless node.name == name
        @parser.parseError
        next
      end
      nil until @tree.openElements.pop == node
      break
    end
  end

  # Character data is inserted into the tree as text.
  def processCharacters(data)
    @tree.insertText(data)
  end
end
end

View file

@ -0,0 +1,178 @@
require 'html5lib/tokenizer'
require 'cgi'
module HTML5lib
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes.
class HTMLSanitizer < HTMLTokenizer
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
button caption center cite code col colgroup dd del dfn dir div dl dt
em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
legend li map menu ol optgroup option p pre q s samp select small span
strike strong sub sup table tbody td textarea tfoot th thead tr tt u
ul var]
MATHML_ELEMENTS = %w[maction math merror mfrac mi mmultiscripts mn mo
mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub
msubsup msup mtable mtd mtext mtr munder munderover none]
SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform
circle defs desc ellipse font-face font-face-name font-face-src g
glyph hkern image linearGradient line marker metadata missing-glyph
mpath path polygon polyline radialGradient rect set stop svg switch
text title tspan use]
ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action
align alt axis border cellpadding cellspacing char charoff charset
checked cite class clear cols colspan color compact coords datetime
dir disabled enctype for frame headers height href hreflang hspace id
ismap label lang longdesc maxlength media method multiple name nohref
noshade nowrap prompt readonly rel rev rows rowspan rules scope
selected shape size span src start style summary tabindex target title
type usemap valign value vspace width xml:lang]
MATHML_ATTRIBUTES = %w[actiontype align columnalign columnalign
columnalign columnlines columnspacing columnspan depth display
displaystyle equalcolumns equalrows fence fontstyle fontweight frame
height linethickness lspace mathbackground mathcolor mathvariant
mathvariant maxsize minsize other rowalign rowalign rowalign rowlines
rowspacing rowspan rspace scriptlevel selection separator stretchy
width width xlink:href xlink:show xlink:type xmlns xmlns:xlink]
SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic
arabic-form ascent attributeName attributeType baseProfile bbox begin
by calcMode cap-height class color color-rendering content cx cy d dx
dy descent display dur end fill fill-rule font-family font-size
font-stretch font-style font-variant font-weight from fx fy g1 g2
glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
ideographic k keyPoints keySplines keyTimes lang marker-end
marker-mid marker-start markerHeight markerUnits markerWidth
mathematical max min name offset opacity orient origin
overline-position overline-thickness panose-1 path pathLength points
preserveAspectRatio r refX refY repeatCount repeatDur
requiredExtensions requiredFeatures restart rotate rx ry slope stemh
stemv stop-color stop-opacity strikethrough-position
strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
stroke-width systemLanguage target text-anchor to transform type u1
u2 underline-position underline-thickness unicode unicode-range
units-per-em values version viewBox visibility width widths x
x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
xmlns:xlink y y1 y2 zoomAndPan]
ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href]
ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color
border-bottom-color border-collapse border-color border-left-color
border-right-color border-top-color clear color cursor direction
display elevation float font font-family font-size font-style
font-variant font-weight height letter-spacing line-height overflow
pause pause-after pause-before pitch pitch-range richness speak
speak-header speak-numeral speak-punctuation speech-rate stress
text-align text-decoration text-indent unicode-bidi vertical-align
voice-family volume white-space width]
ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom
brown center collapse dashed dotted fuchsia gray green !important
italic left lime maroon medium none navy normal nowrap olive pointer
purple red right solid silver teal top transparent underline white
yellow]
ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke
stroke-width stroke-linecap stroke-linejoin stroke-opacity]
ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp
telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]
# subclasses may define their own versions of these constants
ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
# Yields the tokens of the underlying tokenizer, sanitized: elements not in
# ALLOWED_ELEMENTS are escaped (turned into character data), and all
# attributes not in ALLOWED_ATTRIBUTES are stripped out. Style
# attributes are parsed, and a restricted set, specified by
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# Attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
# in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def each
super do |token|
case token[:type]
when :StartTag, :EndTag, :EmptyTag
if ALLOWED_ELEMENTS.include?(token[:name])
if token.has_key? :data
attrs = Hash[*token[:data].flatten]
attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
ATTR_VAL_IS_URI.each do |attr|
# Undo HTML escaping and drop control characters and whitespace before
# looking at the scheme, so that it cannot be disguised with them
val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
# Only values that start with a scheme are checked against the list
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
attrs.delete attr
end
end
if attrs['style']
attrs['style'] = sanitize_css(attrs['style'])
end
# Back to a list of [name, value] pairs
token[:data] = attrs.map {|k,v| [k,v]}
end
yield token
else
# Disallowed element: rebuild the tag's markup as text and pass it on as
# a Characters token instead of a tag
if token[:type] == :EndTag
token[:data] = "</#{token[:name]}>"
elsif token[:data]
attrs = token[:data].map {|k,v| " #{k}=\"#{CGI.escapeHTML(v)}\""}.join('')
token[:data] = "<#{token[:name]}#{attrs}>"
else
token[:data] = "<#{token[:name]}>"
end
# Put the "/" of an empty tag back in front of the closing ">"
token[:data].insert(-2,'/') if token[:type] == :EmptyTag
token[:type] = :Characters
token.delete(:name)
yield token
end
else
# Every other kind of token passes through unchanged
yield token
end
end
end
# Returns a sanitized copy of the value of a style attribute.
#
# url(...) references are removed first. The whole value is then rejected
# (giving '') unless it passes two sanity checks; otherwise only these
# declarations are kept, each rewritten as "property: value;":
#
# * properties listed in ALLOWED_CSS_PROPERTIES,
# * background/border/margin/padding shorthands whose value consists only
#   of ALLOWED_CSS_KEYWORDS, colours and simple lengths,
# * properties listed in ALLOWED_SVG_PROPERTIES.
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet
  # Both checks are anchored with \A and \z so that they apply to the whole
  # value. With ^ and $ a single acceptable line - even an empty one - was
  # enough to let a multi-line value through unchecked.
  return '' unless style =~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
  return '' unless style =~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|$))*\z/

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
    next if val.empty?
    prop.downcase!
    if ALLOWED_CSS_PROPERTIES.include?(prop)
      clean << "#{prop}: #{val};"
    elsif %w[background border margin padding].include?(prop.split('-')[0])
      # Shorthand properties: keep only if every word of the value is an
      # allowed keyword, a colour or a simple length
      clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
        !ALLOWED_CSS_KEYWORDS.include?(keyword) and
        keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
      end
    elsif ALLOWED_SVG_PROPERTIES.include?(prop)
      clean << "#{prop}: #{val};"
    end
  end

  clean.join(' ')
end
end
end

View file

@ -0,0 +1,854 @@
require 'html5lib/constants'
require 'html5lib/inputstream'
module HTML5lib
# This class takes care of tokenizing HTML.
#
# * @currentToken
# Holds the token that is currently being processed.
#
# * @state
# Holds a reference to the method to be invoked... XXX
#
# * @states
# Holds a mapping between states and methods that implement the state.
#
# * @stream
# Points to HTMLInputStream object.
class HTMLTokenizer
attr_accessor :contentModelFlag, :currentToken
attr_reader :stream
# Creates a tokenizer reading from stream.
#
# stream and options are handed unchanged to HTMLInputStream.new. The
# tokenizer starts in the data state with the content model flag set to
# :PCDATA, no current token and an empty token queue.
def initialize(stream, options={})
@stream = HTMLInputStream.new(stream, options)
# Maps each state name to the name of the method that implements it
@states = {
:data => :dataState,
:entityData => :entityDataState,
:tagOpen => :tagOpenState,
:closeTagOpen => :closeTagOpenState,
:tagName => :tagNameState,
:beforeAttributeName => :beforeAttributeNameState,
:attributeName => :attributeNameState,
:afterAttributeName => :afterAttributeNameState,
:beforeAttributeValue => :beforeAttributeValueState,
:attributeValueDoubleQuoted => :attributeValueDoubleQuotedState,
:attributeValueSingleQuoted => :attributeValueSingleQuotedState,
:attributeValueUnQuoted => :attributeValueUnQuotedState,
:bogusComment => :bogusCommentState,
:markupDeclarationOpen => :markupDeclarationOpenState,
:comment => :commentState,
:commentDash => :commentDashState,
:commentEnd => :commentEndState,
:doctype => :doctypeState,
:beforeDoctypeName => :beforeDoctypeNameState,
:doctypeName => :doctypeNameState,
:afterDoctypeName => :afterDoctypeNameState,
:bogusDoctype => :bogusDoctypeState
}
# Setup the initial tokenizer state
@contentModelFlag = :PCDATA
@state = @states[:data]
# The current token being created
@currentToken = nil
# Tokens to be processed.
@tokenQueue = []
end
# This is where the magic happens.
#
# We do our usual processing through the states and when we have a token
# to return we yield the token which pauses processing until the next token
# is requested.
def each
  # Restart from the beginning of the stream with an empty queue.
  @stream.reset
  @tokenQueue = []

  # Each state method returns true to keep going and false once
  # tokenization has ended; the queue is drained after every true step.
  while send(@state)
    yield @tokenQueue.shift until @tokenQueue.empty?
  end
end
# Below are various helper functions used by the tokenizer states.
# If the next character is a '>', convert the currentToken into
# an EmptyTag
def processSolidusInTag
  # Look at the character following the "/" to see whether it is ">".
  following = @stream.char
  closesStartTag = (@currentToken[:type] == :StartTag and following == ">")

  if closesStartTag
    @currentToken[:type] = :EmptyTag
  else
    @tokenQueue << {:type => :ParseError, :data =>
      _("Solidus (/) incorrectly placed in tag.")}
  end

  # The character was only inspected, so hand it back to the stream queue.
  @stream.queue.push(following)
end
# This function returns either U+FFFD or the character based on the
# decimal or hexadecimal representation. It also discards ";" if present.
# If the ";" is missing, a :ParseError token is pushed onto @tokenQueue.
def consumeNumberEntity(isHex)
  # isHex:: true to read hexadecimal digits, false/nil for decimal digits.
  # Returns the referenced character as a UTF-8 string, or U+FFFD when the
  # number is zero or lies beyond the last Unicode code point.
  #
  # XXX More need to be done here. For instance, #13 should prolly be
  # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
  # such. Thoughts on this appreciated.
  allowed = DIGITS
  radix = 10
  if isHex
    allowed = HEX_DIGITS
    radix = 16
  end

  char = [0xFFFD].pack('U')
  charStack = []

  # Consume all the characters that are in range. :EOF is tested first so
  # the symbol is never handed to include?.
  c = @stream.char
  while c != :EOF and allowed.include?(c)
    charStack.push(c)
    c = @stream.char
  end

  # Convert the set of characters consumed to an int.
  charAsInt = charStack.join('').to_i(radix)

  # Numbers 128..159 get the "windows trick": they are looked up in the
  # Windows-1252 table. The range starts at 128 (not 127) so the table
  # index charAsInt - 128 can never be negative.
  if (128...160).include? charAsInt
    #XXX - removed parse error from windows 1252 entity for now
    #we may want to reenable this later
    #@tokenQueue.push({:type => :ParseError, :data =>
    #  _("Entity used with illegal number (windows-1252 reference).")})
    charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
  end

  # 0 is not a good number.
  charAsInt = 65533 if charAsInt == 0

  # U+10FFFF is the last Unicode code point; anything larger cannot be
  # represented and keeps the U+FFFD default.
  if charAsInt <= 0x10FFFF
    char = [charAsInt].pack('U')
  else
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Numeric entity couldn't be converted to character.")})
  end

  # Discard the ; if present. Otherwise, put the terminating character back
  # on the queue and report a parse error.
  if c != ";"
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Numeric entity didn't end with ';'.")})
    @stream.queue.push(c)
  end
  return char
end
# Consumes a character reference after "&" has been read.
#
# Returns the replacement string for a recognised numeric or named entity,
# or nil when no entity could be read (callers then emit a literal "&").
# Characters that were read ahead but do not belong to the entity are
# appended to @stream.queue; parse errors are pushed onto @tokenQueue.
def consumeEntity
  char = nil
  charStack = [@stream.char]
  if charStack[0] == "#"
    # We might have a number entity here.
    charStack += [@stream.char, @stream.char]
    if charStack.include? :EOF
      # If we reach the end of the file put everything up to :EOF
      # back in the queue
      charStack = charStack[0...charStack.index(:EOF)]
      @stream.queue+= charStack
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Numeric entity expected. Got end of file instead.")})
    else
      if charStack[1].downcase == "x" \
        and HEX_DIGITS.include? charStack[2]
        # Hexadecimal entity detected.
        @stream.queue.push(charStack[2])
        char = consumeNumberEntity(true)
      elsif DIGITS.include? charStack[1]
        # Decimal entity detected.
        @stream.queue += charStack[1..-1]
        char = consumeNumberEntity(false)
      else
        # No number entity detected.
        @stream.queue += charStack
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Numeric entity expected but none found.")})
      end
    end
  # Break out if we reach the end of the file
  elsif charStack[0] == :EOF
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Entity expected. Got end of file instead.")})
  else
    # At this point we might have a named entity. Keep consuming characters
    # for as long as what has been read is a prefix of some name in
    # ENTITIES, remembering the longest complete name seen on the way.
    filteredEntityList = ENTITIES.keys
    filteredEntityList.reject! {|e| e[0].chr != charStack[0]}
    entityName = nil

    while charStack[-1] != :EOF
      name = charStack.join('')
      if filteredEntityList.any? {|e| e[0...name.length] == name}
        filteredEntityList.reject! {|e| e[0...name.length] != name}
        charStack.push(@stream.char)
      else
        break
      end

      if ENTITIES.include? name
        entityName = name
      end
    end

    if entityName != nil
      char = ENTITIES[entityName]

      # Everything read past the matched name. Only a ";" that directly
      # follows the name belongs to the entity. Testing the last consumed
      # character instead would drop the "i;" of "&noti;" (matched name
      # "not", read-ahead "i;").
      leftover = charStack[entityName.length..-1]
      if leftover[0] == ";"
        leftover = leftover[1..-1]
      else
        @tokenQueue.push({:type => :ParseError, :data =>
          _("Named entity didn't end with ';'.")})
      end
      @stream.queue += leftover unless leftover.empty?
    else
      @tokenQueue.push({:type => :ParseError, :data =>
        _("Named entity expected. Got none.")})
      @stream.queue += charStack
    end
  end
  return char
end
# This method replaces the need for "entityInAttributeValueState".
def processEntityInAttribute
  # Append the decoded entity to the value of the attribute currently being
  # built, or a literal "&" when consumeEntity returned nothing.
  @currentToken[:data][-1][1] += (consumeEntity || "&")
end
# This method is a generic handler for emitting the tags. It also sets
# the state to "data" because that's what's needed after a token has been
# emitted.
def emitCurrentToken
  # Queue the token for #each to yield, then fall back to the data state.
  @tokenQueue << @currentToken
  @state = @states[:data]
end
# Below are the various tokenizer states worked out.
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
# documents to figure out what the order of the various if and elsif
# statements should be.
# Data state: emits character tokens and detects the start of entities and
# tags. Returns false at end of input (tokenization ends), true otherwise.
def dataState
  ch = @stream.char
  return false if ch == :EOF

  entitiesAllowed = [:PCDATA, :RCDATA].include?(@contentModelFlag)
  if ch == "&" and entitiesAllowed
    @state = @states[:entityData]
  elsif ch == "<" and @contentModelFlag != :PLAINTEXT
    @state = @states[:tagOpen]
  elsif SPACE_CHARACTERS.include? ch
    # Directly after emitting a token the tokenizer is back in this state,
    # so runs of space characters are emitted as their own token.
    # XXX need to check if we don't need a special "spaces" flag on
    # characters.
    @tokenQueue << {:type => :SpaceCharacters, :data =>
      ch + @stream.charsUntil(SPACE_CHARACTERS, true)}
  else
    @tokenQueue << {:type => :Characters, :data =>
      ch + @stream.charsUntil(["&", "<"])}
  end
  true
end
# Entity-in-data state: emits the decoded entity, or a literal "&" when
# consumeEntity returned nothing, then goes back to the data state.
def entityDataState
  @tokenQueue << {:type => :Characters, :data => (consumeEntity || "&")}
  @state = @states[:data]
  true
end
# Tag-open state: entered after "<" was seen in the data state.
# Always returns true.
def tagOpenState
  ch = @stream.char

  if @contentModelFlag != :PCDATA
    # We know the content model flag is set to either RCDATA or CDATA
    # now because this state can never be entered with the PLAINTEXT
    # flag. Only "</" is treated as markup here.
    if ch == "/"
      @state = @states[:closeTagOpen]
    else
      @tokenQueue << {:type => :Characters, :data => "<"}
      @stream.queue.insert(0, ch)
      @state = @states[:data]
    end
    return true
  end

  case ch
  when "!"
    @state = @states[:markupDeclarationOpen]
  when "/"
    @state = @states[:closeTagOpen]
  when ">"
    # XXX In theory it could be something besides a tag name. But
    # do we really care?
    @tokenQueue << {:type => :ParseError, :data =>
      _("Expected tag name. Got '>' instead.")}
    @tokenQueue << {:type => :Characters, :data => "<>"}
    @state = @states[:data]
  when "?"
    # XXX In theory it could be something besides a tag name. But
    # do we really care?
    @tokenQueue << {:type => :ParseError, :data =>
      _("Expected tag name. Got '?' instead (HTML doesn't " +
      "support processing instructions).")}
    @stream.queue.push(ch)
    @state = @states[:bogusComment]
  else
    if ch != :EOF and ASCII_LETTERS.include? ch
      # First letter of a start tag name.
      @currentToken = {:type => :StartTag, :name => ch, :data => []}
      @state = @states[:tagName]
    else
      # XXX
      @tokenQueue << {:type => :ParseError, :data =>
        _("Expected tag name. Got something else instead")}
      @tokenQueue << {:type => :Characters, :data => "<"}
      @stream.queue.push(ch)
      @state = @states[:data]
    end
  end
  true
end
# Close-tag-open state: entered after "</" has been consumed.
#
# In RCDATA/CDATA content the "</" only counts as markup when it is followed
# by the name of @currentToken and a delimiter; otherwise "</" is emitted as
# text. In PCDATA content the next character decides between an end tag, an
# ignored "</>", and a bogus comment. Always returns true.
def closeTagOpenState
if (@contentModelFlag == :RCDATA or @contentModelFlag == :CDATA)
if @currentToken
charStack = []
# So far we know that "</" has been consumed. We now need to know
# whether the next few characters match the name of last emitted
# start tag which also happens to be the currentToken. We also need
# to have the character directly after the characters that could
# match the start tag name.
(@currentToken[:name].length + 1).times do
charStack.push(@stream.char)
# Make sure we don't get hit by :EOF
break if charStack[-1] == :EOF
end
# Since this is just for checking. We put the characters back on
# the stack.
@stream.queue += charStack
end
# charStack[0...-1] is the candidate tag name, charStack[-1] the character
# that follows it. charStack is only set when @currentToken is, which is
# why @currentToken is tested first.
if @currentToken and
@currentToken[:name].downcase ==
charStack[0...-1].join('').downcase and
(SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? charStack[-1]
# Because the characters are correct we can safely switch to
# PCDATA mode now. This also means we don't have to do it when
# emitting the end tag token.
@contentModelFlag = :PCDATA
else
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected closing tag after seeing '</'. None found.")})
@tokenQueue.push({:type => :Characters, :data => "</"})
@state = @states[:data]
# Need to return here since we don't want the rest of the
# method to be walked through.
return true
end
end
# Reached directly in PCDATA content, or after the flag was switched to
# PCDATA just above.
if @contentModelFlag == :PCDATA
data = @stream.char
if data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected closing tag. Unexpected end of file.")})
@tokenQueue.push({:type => :Characters, :data => "</"})
@state = @states[:data]
elsif ASCII_LETTERS.include? data
# First letter of an end tag name.
@currentToken =\
{:type => :EndTag, :name => data, :data => []}
@state = @states[:tagName]
elsif data == ">"
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
@state = @states[:data]
else
# XXX data can be _'_...
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected closing tag. Unexpected character '" + data + "' found.")})
# Hand the character back so the bogus comment state sees it.
@stream.queue.push(data)
@state = @states[:bogusComment]
end
end
return true
end
# Tag-name state: accumulates the name of @currentToken until a delimiter
# is reached. Always returns true.
def tagNameState
  ch = @stream.char
  case ch
  when *SPACE_CHARACTERS
    @state = @states[:beforeAttributeName]
  when :EOF
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected end of file in the tag name.")}
    emitCurrentToken
  when ">"
    emitCurrentToken
  when "<"
    # Hand the "<" back, report it, and emit the tag as it stands.
    @stream.queue.push(ch)
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected < character when getting the tag name.")}
    emitCurrentToken
  when "/"
    processSolidusInTag
    @state = @states[:beforeAttributeName]
  else
    if ASCII_LETTERS.include? ch
      # Take the whole run of letters in one go.
      @currentToken[:name] += ch + @stream.charsUntil(ASCII_LETTERS, true)
    else
      @currentToken[:name] += ch
    end
  end
  true
end
# Before-attribute-name state: skips white space and starts a new
# [name, value] pair on @currentToken[:data] when a name character shows up.
# Always returns true.
def beforeAttributeNameState
  ch = @stream.char
  case ch
  when *SPACE_CHARACTERS
    @stream.charsUntil(SPACE_CHARACTERS, true)
  when :EOF
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected end of file. Expected attribute name instead.")}
    emitCurrentToken
  when ">"
    emitCurrentToken
  when "/"
    processSolidusInTag
  when "<"
    @stream.queue.push(ch)
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected < character. Expected attribute name instead.")}
    emitCurrentToken
  else
    # Letters and any other character alike start a new attribute.
    @currentToken[:data].push([ch, ""])
    @state = @states[:attributeName]
  end
  true
end
# Attribute-name state: extends the name of the attribute being built (the
# last pair in @currentToken[:data]). When the name is finished, a parse
# error is queued for every earlier attribute carrying the same name.
# Always returns true.
def attributeNameState
  ch = @stream.char
  nameFinished = true

  if ch == "="
    @state = @states[:beforeAttributeValue]
  elsif ch == :EOF
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected end of file in attribute name.")}
    emitCurrentToken
    nameFinished = false
  elsif ASCII_LETTERS.include? ch
    @currentToken[:data][-1][0] += ch + @stream.charsUntil(ASCII_LETTERS, true)
    nameFinished = false
  elsif ch == ">"
    # The token is emitted further down, after the duplicate check, so the
    # check still sees the attribute list.
  elsif SPACE_CHARACTERS.include? ch
    @state = @states[:afterAttributeName]
  elsif ch == "/"
    processSolidusInTag
    @state = @states[:beforeAttributeName]
  elsif ch == "<"
    @stream.queue.push(ch)
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected < character in attribute name.")}
    emitCurrentToken
    nameFinished = false
  else
    @currentToken[:data][-1][0] += ch
    nameFinished = false
  end

  if nameFinished
    # Attributes are not dropped at this stage. Values can still be safely
    # appended to them, but the parse error is reported here.
    finishedName = @currentToken[:data][-1][0]
    @currentToken[:data][0...-1].each do |earlierName, earlierValue|
      if finishedName == earlierName
        @tokenQueue << {:type => :ParseError, :data =>
          _("Dropped duplicate attribute on tag.")}
      end
    end
    emitCurrentToken if ch == ">"
  end
  true
end
# After-attribute-name state: entered once white space followed an
# attribute name. Decides whether a value ("="), the end of the tag, or a
# new attribute comes next. Always returns true.
def afterAttributeNameState
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    @stream.charsUntil(SPACE_CHARACTERS, true)
  elsif data == "="
    @state = @states[:beforeAttributeValue]
  elsif data == ">"
    emitCurrentToken
  elsif data == "/"
    processSolidusInTag
    @state = @states[:beforeAttributeName]
  elsif data == "<"
    @stream.queue.push(data)
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected < character. Expected = or end of tag.")})
    emitCurrentToken
  elsif data == :EOF
    # This test must come before anything that treats data as a character.
    # It used to sit behind an ASCII_LETTERS.include? call, which received
    # the :EOF symbol first.
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file. Expected = or end of tag.")})
    emitCurrentToken
  else
    # Letters and any other character alike start a new attribute.
    @currentToken[:data].push([data, ""])
    @state = @states[:attributeName]
  end
  return true
end
# Before-attribute-value state: entered after "=". Picks the quoting style
# of the value that follows. Always returns true.
def beforeAttributeValueState
  ch = @stream.char
  case ch
  when *SPACE_CHARACTERS
    @stream.charsUntil(SPACE_CHARACTERS, true)
  when "\""
    @state = @states[:attributeValueDoubleQuoted]
  when "&"
    # Let the unquoted-value state see the "&" itself.
    @state = @states[:attributeValueUnQuoted]
    @stream.queue.push(ch)
  when "'"
    @state = @states[:attributeValueSingleQuoted]
  when ">"
    emitCurrentToken
  when "<"
    @stream.queue.push(ch)
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected < character. Expected attribute value.")}
    emitCurrentToken
  when :EOF
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected end of file. Expected attribute value.")}
    emitCurrentToken
  else
    # First character of an unquoted value.
    @currentToken[:data][-1][1] += ch
    @state = @states[:attributeValueUnQuoted]
  end
  true
end
# Double-quoted attribute value state: appends to the value of the last
# attribute until the closing double quote. Always returns true.
def attributeValueDoubleQuotedState
  ch = @stream.char
  case ch
  when "\""
    @state = @states[:beforeAttributeName]
  when "&"
    processEntityInAttribute
  when :EOF
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected end of file in attribute value (\").")}
    emitCurrentToken
  else
    @currentToken[:data][-1][1] += ch + @stream.charsUntil(["\"", "&"])
  end
  true
end
# Single-quoted attribute value state: appends to the value of the last
# attribute until the closing single quote. Always returns true.
def attributeValueSingleQuotedState
  ch = @stream.char
  case ch
  when "'"
    @state = @states[:beforeAttributeName]
  when "&"
    processEntityInAttribute
  when :EOF
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected end of file in attribute value (').")}
    emitCurrentToken
  else
    @currentToken[:data][-1][1] += ch + @stream.charsUntil(["'", "&"])
  end
  true
end
# Unquoted attribute value state: appends to the value of the last
# attribute until white space, ">", "<" or end of input.
# Always returns true.
def attributeValueUnQuotedState
  ch = @stream.char
  case ch
  when *SPACE_CHARACTERS
    @state = @states[:beforeAttributeName]
  when "&"
    processEntityInAttribute
  when ">"
    emitCurrentToken
  when "<"
    @stream.queue.push(ch)
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected < character in attribute value.")}
    emitCurrentToken
  when :EOF
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected end of file in attribute value.")}
    emitCurrentToken
  else
    valueStops = ["&", ">","<"] + SPACE_CHARACTERS
    @currentToken[:data][-1][1] += ch + @stream.charsUntil(valueStops)
  end
  true
end
# Bogus-comment state: turns everything up to the next ">" (or the end of
# input) into a single comment token. Always returns true.
def bogusCommentState
  commentText = @stream.charsUntil((">"))
  @tokenQueue << {:type => :Comment, :data => commentText}

  # Eat the character directly after the bogus comment which is either a
  # ">" or an :EOF.
  @stream.char

  @state = @states[:data]
  true
end
# Markup-declaration-open state: entered after "<!". Recognises "--"
# (comment) and "DOCTYPE" in any letter case; anything else is handed to
# the bogus comment state. Always returns true.
def markupDeclarationOpenState
  charStack = [@stream.char, @stream.char]
  if charStack == ["-", "-"]
    @currentToken = {:type => :Comment, :data => ""}
    @state = @states[:comment]
    return true
  end

  # Read five more characters so that seven are available for "DOCTYPE".
  5.times { charStack.push(@stream.char) }

  # Put in explicit :EOF check
  if !charStack.include?(:EOF) and charStack.join("").upcase == "DOCTYPE"
    @currentToken = {:type => :Doctype, :name => "", :data => true}
    @state = @states[:doctype]
  else
    @tokenQueue << {:type => :ParseError, :data =>
      _("Expected '--' or 'DOCTYPE'. Not found.")}
    @stream.queue += charStack
    @state = @states[:bogusComment]
  end
  true
end
# Comment state: collects comment text up to the next "-".
# Always returns true.
def commentState
  ch = @stream.char
  case ch
  when "-"
    @state = @states[:commentDash]
  when :EOF
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected end of file in comment.")}
    @tokenQueue << @currentToken
    @state = @states[:data]
  else
    @currentToken[:data] += ch + @stream.charsUntil("-")
  end
  true
end
# Comment-dash state: entered after one "-" inside a comment.
# Always returns true.
def commentDashState
  ch = @stream.char
  case ch
  when "-"
    @state = @states[:commentEnd]
  when :EOF
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected end of file in comment (-)")}
    @tokenQueue << @currentToken
    @state = @states[:data]
  else
    # The lone "-" belonged to the comment text after all.
    @currentToken[:data] += "-" + ch + @stream.charsUntil("-")
    # Consume the next character which is either a "-" or an :EOF as
    # well so if there's a "-" directly after the "-" we go nicely to
    # the "comment end state" without emitting a ParseError there.
    @stream.char
  end
  true
end
# Comment-end state: entered after "--" inside a comment.
# Always returns true.
def commentEndState
  ch = @stream.char
  case ch
  when ">"
    @tokenQueue << @currentToken
    @state = @states[:data]
  when "-"
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected '-' after '--' found in comment.")}
    @currentToken[:data] += ch
  when :EOF
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected end of file in comment (--).")}
    @tokenQueue << @currentToken
    @state = @states[:data]
  else
    # XXX
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected character in comment found.")}
    @currentToken[:data] += "--" + ch
    @state = @states[:comment]
  end
  true
end
# Doctype state: entered after "<!DOCTYPE". A missing space is reported and
# the character is handed back; either way the next state is
# beforeDoctypeName. Always returns true.
def doctypeState
  ch = @stream.char
  unless SPACE_CHARACTERS.include? ch
    @tokenQueue << {:type => :ParseError, :data =>
      _("No space after literal string 'DOCTYPE'.")}
    @stream.queue.push(ch)
  end
  @state = @states[:beforeDoctypeName]
  true
end
# Before-doctype-name state: skips white space and stores the first
# character of the DOCTYPE name (lowercase ASCII letters are upcased).
# Always returns true.
def beforeDoctypeNameState
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    # Skip white space in front of the name.
  elsif data == ">"
    # Character needs to be consumed per the specification so don't
    # invoke emitCurrentTokenWithParseError with :data as argument.
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected > character. Expected DOCTYPE name.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  elsif data == :EOF
    # This test must come before the ASCII_LOWERCASE lookup below, which
    # expects a character rather than the :EOF symbol.
    @tokenQueue.push({:type => :ParseError, :data =>
      _("Unexpected end of file. Expected DOCTYPE name.")})
    @tokenQueue.push(@currentToken)
    @state = @states[:data]
  elsif ASCII_LOWERCASE.include? data
    @currentToken[:name] = data.upcase
    @state = @states[:doctypeName]
  else
    @currentToken[:name] = data
    @state = @states[:doctypeName]
  end
  return true
end
# Doctype-name state: extends the DOCTYPE name, upcasing lowercase ASCII
# letters. When the name reads "HTML" after white space or a name
# character, @currentToken[:data] is set to false. Always returns true.
def doctypeNameState
  ch = @stream.char
  checkName = false

  case ch
  when *SPACE_CHARACTERS
    @state = @states[:afterDoctypeName]
    checkName = true
  when ">"
    @tokenQueue << @currentToken
    @state = @states[:data]
  when :EOF
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected end of file in DOCTYPE name.")}
    @tokenQueue << @currentToken
    @state = @states[:data]
  else
    # We can't just uppercase everything that arrives here. For
    # instance, non-ASCII characters.
    ch = ch.upcase if ASCII_LOWERCASE.include? ch
    @currentToken[:name] += ch
    checkName = true
  end

  # After some iterations through this state it should eventually say
  # "HTML". Otherwise there's an error.
  @currentToken[:data] = false if checkName and @currentToken[:name] == "HTML"
  true
end
# After-doctype-name state: entered once white space followed the DOCTYPE
# name. Only more white space or ">" is expected here.
# Always returns true.
def afterDoctypeNameState
  ch = @stream.char
  case ch
  when *SPACE_CHARACTERS
    # Nothing to do for further white space.
  when ">"
    @tokenQueue << @currentToken
    @state = @states[:data]
  when :EOF
    @currentToken[:data] = true
    # XXX EMIT
    @stream.queue.push(ch)
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected end of file in DOCTYPE.")}
    @tokenQueue << @currentToken
    @state = @states[:data]
  else
    @tokenQueue << {:type => :ParseError, :data =>
      _("Expected space or '>'. Got '" + ch + "'")}
    @currentToken[:data] = true
    @state = @states[:bogusDoctype]
  end
  true
end
# Bogus-doctype state: discards characters until ">" or end of input, then
# emits the doctype token. Always returns true.
def bogusDoctypeState
  ch = @stream.char
  case ch
  when ">"
    @tokenQueue << @currentToken
    @state = @states[:data]
  when :EOF
    # XXX EMIT
    @stream.queue.push(ch)
    @tokenQueue << {:type => :ParseError, :data =>
      _("Unexpected end of file in bogus doctype.")}
    @tokenQueue << @currentToken
    @state = @states[:data]
  end
  true
end
# Translation hook for message strings; returns its argument unchanged.
def _(string)
  string
end
end
end

View file

@ -0,0 +1,21 @@
module HTML5lib
  module TreeBuilders

    # Loads and returns the TreeBuilder class registered under +name+.
    #
    # name:: 'simpletree', 'rexml' or 'hpricot'; compared after to_s and
    #        downcase, so symbols and mixed case are accepted.
    #
    # The implementing file is only required when its builder is asked for.
    # Raises RuntimeError for any other name.
    def self.getTreeBuilder(name)
      key = name.to_s.downcase
      if key == 'simpletree'
        require 'html5lib/treebuilders/simpletree'
        SimpleTree::TreeBuilder
      elsif key == 'rexml'
        require 'html5lib/treebuilders/rexml'
        REXMLTree::TreeBuilder
      elsif key == 'hpricot'
        require 'html5lib/treebuilders/hpricot'
        Hpricot::TreeBuilder
      else
        raise "Unknown TreeBuilder #{name}"
      end
    end

  end
end

View file

@ -0,0 +1,330 @@
require 'html5lib/constants'
#XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
module HTML5lib
# The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, buttons, object elements, and marquees.
Marker = nil
module TreeBuilders
module Base
class Node
  # parent::     the parent of the current node (or nil for the document node)
  # childNodes:: a list of child nodes of the current node. This must
  #              include all elements but not necessarily other node types
  # _flags::     a list of miscellaneous flags that can be set on the node
  attr_accessor :parent, :childNodes, :_flags

  # The +name+ argument is accepted but not stored here.
  def initialize(name)
    @parent, @childNodes, @_flags = nil, [], []
  end

  # Insert node as a child of the current node
  def appendChild(node)
    raise NotImplementedError
  end

  # Insert data as text in the current node, positioned before the
  # start of node insertBefore or to the end of the node's text.
  def insertText(data, insertBefore = nil)
    raise NotImplementedError
  end

  # Insert node as a child of the current node, before refNode in the
  # list of child nodes. Raises ValueError if refNode is not a child of
  # the current node
  def insertBefore(node, refNode)
    raise NotImplementedError
  end

  # Remove node from the children of the current node
  def removeChild(node)
    raise NotImplementedError
  end

  # Move all the children of the current node to newParent.
  # This is needed so that trees that don't store text as nodes move the
  # text in the correct way
  def reparentChildren(newParent)
    #XXX - should this method be made more general?
    @childNodes.each do |child|
      newParent.appendChild(child)
    end
    @childNodes = []
  end

  # Return a shallow copy of the current node i.e. a node with the same
  # name and attributes but with no parent or child nodes
  def cloneNode
    raise NotImplementedError
  end

  # Return true if the node has children or text, false otherwise
  def hasContent
    raise NotImplementedError
  end
end
# Base treebuilder implementation
class TreeBuilder
  attr_accessor :openElements
  attr_accessor :activeFormattingElements
  attr_accessor :document
  attr_accessor :headPointer
  attr_accessor :formPointer

  # The node classes are read from instance variables, which concrete
  # builders are expected to assign:
  #
  #   @documentClass  class to use for the document root
  #   @elementClass   class to use for HTML elements
  #   @commentClass   class to use for comments
  #   @doctypeClass   class to use for doctypes
  #   @fragmentClass  class to use for fragments

  def initialize
    reset
  end

  # Clear all parsing state and create a fresh document from @documentClass.
  def reset
    @openElements = []
    @activeFormattingElements = []

    #XXX - rename these to headElement, formElement
    @headPointer = nil
    @formPointer = nil

    self.insertFromTable = false

    @document = @documentClass.new
  end

  # Return true when an open element named +target+ is found before a
  # scope boundary, searching from the most recently opened element.
  # 'table' and 'html' always end the search; SCOPING_ELEMENTS end it too
  # unless +tableVariant+ is set.
  #
  # Raises RuntimeError when the stack is exhausted without meeting a
  # boundary.
  def elementInScope(target, tableVariant = false)
    # Exit early when possible.
    return true if @openElements[-1].name == target

    # AT How about while true and simply set node to [-1] and set it to
    # [-2] at the end...
    @openElements.reverse.each do |element|
      if element.name == target
        return true
      elsif element.name == 'table'
        return false
      elsif not tableVariant and SCOPING_ELEMENTS.include?(element.name)
        return false
      elsif element.name == 'html'
        return false
      end
    end

    # We should never reach this point.
    raise "elementInScope: no scope boundary found among the open elements"
  end

  # Re-open the trailing run of active formatting elements that are no
  # longer on the stack of open elements, replacing each list entry with
  # the newly inserted element.
  def reconstructActiveFormattingElements
    # Step 1: stop the algorithm when there's nothing to do.
    return if @activeFormattingElements.empty?

    # Step 2 and step 3: we start with the last element.
    entry = @activeFormattingElements[-1]
    return if entry == Marker or @openElements.include?(entry)

    # Steps 4 to 6: walk backwards until the entry in front of index is a
    # marker or an open element, or the start of the list is reached.
    index = @activeFormattingElements.length - 1
    while index > 0
      earlier = @activeFormattingElements[index - 1]
      break if earlier == Marker or @openElements.include?(earlier)
      index -= 1
    end

    # Steps 7 to 11: re-create every entry from index to the end of the list.
    index.upto(@activeFormattingElements.length - 1) do |position|
      clone = @activeFormattingElements[position].cloneNode
      element = insertElement(clone.name, clone.attributes)
      @activeFormattingElements[position] = element
    end
  end

  # Pop entries off the list of active formatting elements up to and
  # including the last Marker (or until the list is empty).
  def clearActiveFormattingElements
    until @activeFormattingElements.empty?
      break if @activeFormattingElements.pop == Marker
    end
  end

  # Check if an element exists between the end of the active
  # formatting elements and the last marker. If it does, return it, else
  # return false
  def elementInActiveFormattingElements(name)
    @activeFormattingElements.reverse.each do |element|
      # Check for Marker first because if it's a Marker it doesn't have a
      # name attribute.
      break if element == Marker
      return element if element.name == name
    end
    return false
  end

  # Append a doctype node built from @doctypeClass to the document.
  def insertDoctype(name)
    @document.appendChild(@doctypeClass.new(name))
  end

  # Append a comment node to +parent+, or to the current open element when
  # no parent is given.
  def insertComment(data, parent = nil)
    parent = @openElements[-1] if parent.nil?
    parent.appendChild(@commentClass.new(data))
  end

  # Create an element but don't insert it anywhere
  def createElement(name, attributes)
    element = @elementClass.new(name)
    element.attributes = attributes
    return element
  end

  # Switch the function used to insert an element from the
  # normal one to the misnested table one and back again
  def insertFromTable=(value)
    @insertFromTable = value
    @insertElement = value ? :insertElementTable : :insertElementNormal
  end

  # Insert an element using whichever strategy insertFromTable= selected.
  def insertElement(name, attributes)
    send(@insertElement, name, attributes)
  end

  # Create an element, append it to the current open element and push it
  # onto the stack of open elements.
  def insertElementNormal(name, attributes)
    element = createElement(name, attributes)
    @openElements[-1].appendChild(element)
    @openElements.push(element)
    return element
  end

  # Create an element and insert it into the tree
  def insertElementTable(name, attributes)
    unless TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)
      return insertElementNormal(name, attributes)
    end

    #We should be in the InTable mode. This means we want to do
    #special magic element rearranging
    element = createElement(name, attributes)
    parent, insertBefore = getTableMisnestedNodePosition
    if insertBefore.nil?
      parent.appendChild(element)
    else
      parent.insertBefore(element, insertBefore)
    end
    @openElements.push(element)
    return element
  end

  # Insert text into +parent+ (default: the current open element). While
  # table insertion is switched on and the current open element is one of
  # TABLE_INSERT_MODE_ELEMENTS, the text goes to the foster parent instead.
  def insertText(data, parent = nil)
    parent = @openElements[-1] if parent.nil?

    if @insertFromTable and TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)
      #We should be in the InTable mode. This means we want to do
      #special magic element rearranging
      parent, insertBefore = getTableMisnestedNodePosition
      parent.insertText(data, insertBefore)
    else
      parent.insertText(data)
    end
  end

  # Get the foster parent element, and sibling to insert before
  # (or nil) when inserting a misnested table node
  def getTableMisnestedNodePosition
    #The foster parent element is the one which comes before the most
    #recently opened table element
    #XXX - this is really inelegant
    fosterParent = nil
    insertBefore = nil
    lastTable = @openElements.reverse.find { |element| element.name == "table" }

    if lastTable
      #XXX - we should really check that this parent is actually a
      #node here
      if lastTable.parent
        fosterParent = lastTable.parent
        insertBefore = lastTable
      else
        fosterParent = @openElements[@openElements.index(lastTable) - 1]
      end
    else
      fosterParent = @openElements[0]
    end
    return fosterParent, insertBefore
  end

  # Pop open elements for as long as the current one is dd, dt, li, p, td,
  # th or tr and is not named +exclude+.
  def generateImpliedEndTags(exclude = nil)
    # XXX This is not entirely what the specification says. We should
    # investigate it more closely.
    implied = ['dd', 'dt', 'li', 'p', 'td', 'th', 'tr']
    loop do
      name = @openElements[-1].name
      break unless implied.include?(name) and name != exclude
      @openElements.pop
    end
  end

  # Return the document node created by reset.
  def getDocument
    @document
  end

  # Move the children of the first open element into a new fragment built
  # from @fragmentClass and return that fragment.
  def getFragment
    #assert @innerHTML
    fragment = @fragmentClass.new
    @openElements[0].reparentChildren(fragment)
    return fragment
  end

  # Serialize the subtree of node in the format required by unit tests
  # node - the node from which to start serializing
  def testSerializer(node)
    raise NotImplementedError
  end
end
end
end
end

View file

@ -0,0 +1,211 @@
require 'html5lib/treebuilders/base'
require 'hpricot'
require 'forwardable'
module HTML5lib
module TreeBuilders
module Hpricot
class Node < Base::Node
extend Forwardable
def_delegators :@hpricot, :name
attr_accessor :hpricot
# Runs the base initializer, then creates the wrapped Hpricot object from
# the class returned by self.class.hpricot_class.
def initialize(name)
  super(name)
  @hpricot = self.class.hpricot_class.new(name)
end
# Append +node+ as the last child. A text node that follows another text
# node is merged into it instead of being added. +node.parent+ is set to
# this node in both cases.
def appendChild(node)
  lastChild = childNodes.last
  mergeIntoLast = (node.kind_of?(TextNode) and childNodes.any? and lastChild.kind_of?(TextNode))

  if mergeIntoLast
    lastChild.hpricot.content = lastChild.hpricot.to_s + node.hpricot.to_s
  else
    childNodes << node
    hpricot.children << node.hpricot
  end
  node.parent = self
end
# Remove +node+ from childNodes and its Hpricot object from
# hpricot.children, then clear +node.parent+.
def removeChild(node)
  childNodes.delete(node)
  position = hpricot.children.index(node.hpricot)
  hpricot.children.delete_at(position)
  node.parent = nil
end
# Wrap +data+ in a TextNode and insert it in front of +before+, or append
# it when +before+ is nil.
def insertText(data, before = nil)
  textNode = TextNode.new(data)
  before ? insertBefore(textNode, before) : appendChild(textNode)
end
# Insert +node+ as a child of this node, directly in front of +refNode+.
#
# A text node inserted right after an existing text node is merged into it
# instead. Otherwise the node is added to childNodes and its Hpricot object
# is added to hpricot.children, as appendChild does. +node.parent+ is set
# to this node in both cases, also as appendChild does.
#
# Raises ArgumentError when +refNode+ is not one of this node's children.
def insertBefore(node, refNode)
  index = childNodes.index(refNode)
  raise ArgumentError, "refNode is not a child of this node" if index.nil?

  if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
    childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
  else
    childNodes.insert(index, node)
    # Look the position up by object, as removeChild does. Append if the
    # reference object is not in the list.
    position = hpricot.children.index(refNode.hpricot) || hpricot.children.length
    hpricot.children.insert(position, node.hpricot)
  end
  node.parent = self
end
# True when childNodes holds at least one truthy entry.
def hasContent
  childNodes.any? { |child| child }
end
end
class Element < Node
  # Hpricot class wrapped by element nodes.
  def self.hpricot_class
    ::Hpricot::Elem
  end

  # Runs the Node initializer, then replaces the wrapped object with an
  # Hpricot::Elem built from a start tag carrying +name+.
  def initialize(name)
    super(name)
    @hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
  end

  # Element name, read from the wrapped start tag.
  def name
    @hpricot.stag.name
  end

  # Returns a new node of the same class and name with every attribute
  # copied onto its Hpricot element.
  def cloneNode
    copy = self.class.new(name)
    attributes.each { |attrName, attrValue| copy.hpricot[attrName] = attrValue }
    copy
  end

  # A call to Hpricot::Elem#raw_attributes is built dynamically,
  # so alterations to the returned value (a hash) will be lost.
  #
  # AttributeProxy works around this by forwarding :[]= calls
  # to the raw_attributes accessor on the element start tag.
  #
  class AttributeProxy
    def initialize(hpricot)
      @hpricot = hpricot
    end

    # Write straight into the attribute hash held by the start tag.
    def []=(k, v)
      rawAttributes = @hpricot.stag.send(stag_attributes_method)
      rawAttributes[k] = v
    end

    # Name of the start tag accessor to write through.
    def stag_attributes_method
      # STag#attributes changed to STag#raw_attributes after Hpricot 0.5
      if @hpricot.stag.respond_to?(:raw_attributes)
        :raw_attributes
      else
        :attributes
      end
    end

    # Every other call is answered by the element's attributes object.
    def method_missing(*args, &block)
      @hpricot.attributes.send(*args, &block)
    end
  end

  # Attribute access goes through a fresh AttributeProxy on every call.
  def attributes
    AttributeProxy.new(@hpricot)
  end

  # Sets each name/value pair of +attrs+ on the wrapped element.
  def attributes=(attrs)
    attrs.each do |attrName, attrValue|
      @hpricot[attrName] = attrValue
    end
  end

  # Renders this element, its attributes (except 'xmlns') and its children
  # as indented "\n|"-prefixed lines.
  def printTree(indent = 0)
    out = "\n|#{' ' * indent}<#{name}>"
    childIndent = indent + 2
    attributes.each do |attrName, attrValue|
      next if attrName == 'xmlns'
      out += "\n|#{' ' * childIndent}#{attrName}=\"#{attrValue}\""
    end
    childNodes.each { |child| out += child.printTree(childIndent) }
    out
  end
end
# Root node backed by an Hpricot::Doc.
class Document < Node
  def self.hpricot_class
    ::Hpricot::Doc
  end

  def initialize
    super(nil)
  end

  # Tree dump of the whole document, headed by the "#document" marker.
  def printTree(indent = 0)
    '#document' + childNodes.map { |child| child.printTree(indent + 2) }.join
  end
end
# Doctype node backed by an Hpricot::DocType.
class DocumentType < Node
  def self.hpricot_class
    ::Hpricot::DocType
  end

  def initialize(name)
    begin
      super(name)
    rescue ArgumentError
      # Node#initialize builds the Hpricot object with a single argument,
      # but DocType needs 3; the real object is created just below.
    end
    @hpricot = ::Hpricot::DocType.new(name, nil, nil)
  end

  # Tree-dump line for the doctype.
  def printTree(indent = 0)
    format("\n|%s<!DOCTYPE %s>", ' ' * indent, hpricot.target)
  end
end
# Container used for fragment parsing: an Element with an empty name.
class DocumentFragment < Element
  def initialize
    super('')
  end

  # Dumps only the children; the fragment itself produces no line.
  def printTree(indent = 0)
    childNodes.map { |child| child.printTree(indent + 2) }.join
  end
end
# Text node backed by an Hpricot::Text. The constructor deliberately
# bypasses Node#initialize and only sets up the wrapped object.
class TextNode < Node
  def initialize(data)
    @hpricot = ::Hpricot::Text.new(data)
  end

  # Tree-dump line: the text content in double quotes.
  def printTree(indent = 0)
    format("\n|%s\"%s\"", ' ' * indent, hpricot.content)
  end
end
# Comment node backed by an Hpricot::Comment.
class CommentNode < Node
  def self.hpricot_class
    ::Hpricot::Comment
  end

  # Tree-dump line for the comment.
  def printTree(indent = 0)
    format("\n|%s<!-- %s -->", ' ' * indent, hpricot.content)
  end
end
# Tree builder that produces Hpricot objects.
class TreeBuilder < Base::TreeBuilder
  # Registers the node classes of this module with the builder.
  def initialize
    @documentClass = Document
    @doctypeClass  = DocumentType
    @elementClass  = Element
    @commentClass  = CommentNode
    @fragmentClass = DocumentFragment
  end

  # Serialises +node+ in the tree-dump format used by the test suite.
  def testSerializer(node)
    node.printTree
  end

  # The underlying Hpricot document.
  def getDocument
    @document.hpricot
  end

  # Stores the fragment returned by the base class as the current
  # document and returns its Hpricot children.
  def getFragment
    @document = super
    @document.hpricot.children
  end
end
end
end
end

View file

@ -0,0 +1,191 @@
require 'html5lib/treebuilders/base'
require 'rexml/document'
require 'forwardable'
module HTML5lib
module TreeBuilders
module REXMLTree
# Common behaviour for every node of the REXML-backed tree.
#
# A node keeps two structures in step: the builder-level +childNodes+
# array and the children of the wrapped REXML object (+rxobj+).
class Node < Base::Node
  extend Forwardable
  def_delegators :@rxobj, :name, :attributes
  attr_accessor :rxobj

  # Wraps a fresh instance of the subclass's +rxclass+.
  def initialize name
    super name
    @rxobj = self.class.rxclass.new name
  end

  # Appends +node+ as the last child. Text that directly follows another
  # text node is merged into that node (kept in raw mode).
  def appendChild node
    if node.kind_of? TextNode and
      childNodes.length>0 and childNodes[-1].kind_of? TextNode
      childNodes[-1].rxobj.value =
        childNodes[-1].rxobj.to_s + node.rxobj.to_s
      childNodes[-1].rxobj.raw = true
    else
      childNodes.push node
      rxobj.add node.rxobj
    end
    node.parent = self
  end

  # Detaches +node+ from both the builder tree and the REXML tree.
  def removeChild node
    childNodes.delete node
    rxobj.delete node.rxobj
    node.parent = nil
  end

  # Adds a text node holding +data+, either in front of +before+ or at the
  # end of the children when +before+ is nil.
  def insertText data, before=nil
    if before
      insertBefore TextNode.new(data), before
    else
      appendChild TextNode.new(data)
    end
  end

  # Inserts +node+ immediately before the existing child +refNode+.
  # Text that would directly follow a text node is merged into it.
  def insertBefore node, refNode
    index = childNodes.index(refNode)
    if node.kind_of? TextNode and index>0 and
      childNodes[index-1].kind_of? TextNode
      childNodes[index-1].rxobj.value =
        childNodes[index-1].rxobj.to_s + node.rxobj.to_s
      childNodes[index-1].rxobj.raw = true
    else
      childNodes.insert index, node
      # Mirror the insertion in the REXML tree, as appendChild does with
      # rxobj.add; otherwise the node never shows up in getDocument.
      rxobj.insert_before(refNode.rxobj, node.rxobj)
    end
    # Keep the parent link consistent with appendChild.
    node.parent = self
  end

  # True when the node has at least one child.
  def hasContent
    return (childNodes.length > 0)
  end
end
# Element node backed by a REXML::Element.
class Element < Node
  def self.rxclass
    REXML::Element
  end

  def initialize(name)
    super(name)
  end

  # Returns a childless copy of this element carrying the same attributes.
  def cloneNode
    copy = self.class.new(name)
    attributes.each { |attr_name, attr_value| copy.attributes[attr_name] = attr_value }
    copy
  end

  # Copies every name/value pair of +value+ onto the element.
  def attributes=(value)
    value.each { |attr_name, attr_value| rxobj.attributes[attr_name] = attr_value }
  end

  # Serialises the element, its attributes (except xmlns) and its children
  # in the tree-dump format used by the test suite.
  def printTree(indent = 0)
    dump = "\n|#{' ' * indent}<#{name}>"
    inner = indent + 2
    attributes.each do |attr_name, attr_value|
      next if attr_name == 'xmlns'
      dump += "\n|#{' ' * inner}#{attr_name}=\"#{attr_value}\""
    end
    childNodes.each { |child| dump += child.printTree(inner) }
    dump
  end
end
# Root node backed by a REXML::Document.
class Document < Node
  def self.rxclass
    REXML::Document
  end

  def initialize
    super(nil)
  end

  # Same as Node#appendChild, but an <html> element is first given the
  # XHTML default namespace.
  def appendChild(node)
    is_html_root = (node.kind_of? Element and node.name == 'html')
    node.rxobj.add_namespace('http://www.w3.org/1999/xhtml') if is_html_root
    super(node)
  end

  # Tree dump of the whole document, headed by the "#document" marker.
  def printTree(indent = 0)
    dump = "#document"
    childNodes.each { |child| dump += child.printTree(indent + 2) }
    dump
  end
end
# Doctype node backed by a REXML::DocType.
class DocumentType < Node
  def self.rxclass
    REXML::DocType
  end

  # Tree-dump line for the doctype.
  def printTree(indent = 0)
    format("\n|%s<!DOCTYPE %s>", ' ' * indent, name)
  end
end
# Container used for fragment parsing: an Element created without a name.
class DocumentFragment < Element
  def initialize
    super(nil)
  end

  # Dumps only the children; the fragment itself produces no line.
  def printTree(indent = 0)
    dump = ""
    childNodes.each { |child| dump += child.printTree(indent + 2) }
    dump
  end
end
# Text node backed by a REXML::Text. The constructor deliberately
# bypasses Node#initialize and only sets up the wrapped object.
class TextNode < Node
  # Escapes &, < and > (in that order) and stores the result as raw text.
  def initialize(data)
    escaped = [['&', '&amp;'], ['<', '&lt;'], ['>', '&gt;']].inject(data) do |text, (char, entity)|
      text.gsub(char, entity)
    end
    @rxobj = REXML::Text.new(escaped, true, nil, true)
  end

  # Tree-dump line: the text value in double quotes.
  def printTree(indent = 0)
    format("\n|%s\"%s\"", ' ' * indent, rxobj.value)
  end
end
# Comment node backed by a REXML::Comment.
class CommentNode < Node
  def self.rxclass
    REXML::Comment
  end

  # Tree-dump line for the comment.
  def printTree(indent = 0)
    format("\n|%s<!-- %s -->", ' ' * indent, rxobj.string)
  end
end
# Tree builder that produces REXML objects.
class TreeBuilder < Base::TreeBuilder
  # Registers the node classes of this module with the builder.
  def initialize
    @documentClass = Document
    @doctypeClass  = DocumentType
    @elementClass  = Element
    @commentClass  = CommentNode
    @fragmentClass = DocumentFragment
  end

  # Serialises +node+ in the tree-dump format used by the test suite.
  def testSerializer(node)
    node.printTree
  end

  # The underlying REXML document.
  def getDocument
    @document.rxobj
  end

  # Stores the fragment returned by the base class as the current
  # document and returns its REXML children.
  def getFragment
    @document = super
    @document.rxobj.children
  end
end
end
end
end

View file

@ -0,0 +1,178 @@
require 'html5lib/treebuilders/base'
module HTML5lib
module TreeBuilders
module SimpleTree
# Node representing an item in the pure-Ruby tree.
class Node < Base::Node
  # The tag name associated with the node
  attr_accessor :name

  # The value of the current node (applies to text nodes and comments)
  attr_accessor :value

  # A hash holding name, value pairs for the attributes of the node
  attr_accessor :attributes

  def initialize name
    super
    @name = name
    @value = nil
    @attributes = {}
  end

  # Appends +node+ as the last child. Text that directly follows another
  # text node is merged into that node instead of being stored separately.
  def appendChild node
    if node.kind_of? TextNode and
      childNodes.length>0 and childNodes[-1].kind_of? TextNode
      childNodes[-1].value += node.value
    else
      childNodes.push node
    end
    node.parent = self
  end

  # Detaches +node+ from this node's children.
  def removeChild node
    childNodes.delete node
    node.parent = nil
  end

  # Returns a childless copy with the same name, attributes and value.
  def cloneNode
    newNode = self.class.new name
    attributes.each {|attr_name, attr_value| newNode.attributes[attr_name] = attr_value}
    newNode.value = value
    newNode
  end

  # Adds a text node holding +data+, either in front of +before+ or at the
  # end of the children when +before+ is nil.
  def insertText data, before=nil
    if before
      insertBefore TextNode.new(data), before
    else
      appendChild TextNode.new(data)
    end
  end

  # Inserts +node+ immediately before the existing child +refNode+.
  # Text that would directly follow a text node is merged into it.
  def insertBefore node, refNode
    index = childNodes.index(refNode)
    if node.kind_of? TextNode and index>0 and
      childNodes[index-1].kind_of? TextNode
      childNodes[index-1].value += node.value
    else
      childNodes.insert index, node
    end
    # Keep the parent link consistent with appendChild; without it an
    # inserted node reports no parent.
    node.parent = self
  end

  # Serialises the node (via to_s) and its children in the tree-dump
  # format used by the test suite.
  def printTree indent=0
    tree = "\n|%s%s" % [' '* indent, self.to_s]
    childNodes.each do |child|
      tree += child.printTree(indent + 2)
    end
    return tree
  end

  # True when the node has at least one child.
  def hasContent
    return (childNodes.length > 0)
  end
end
# Element node of the pure-Ruby tree.
class Element < Node
  def to_s
    "<#{name}>"
  end

  # Serialises the element, every attribute and the children in the
  # tree-dump format used by the test suite.
  def printTree(indent = 0)
    dump = "\n|#{' ' * indent}#{self}"
    inner = indent + 2
    attributes.each do |attr_name, attr_value|
      dump += "\n|#{' ' * inner}#{attr_name}=\"#{attr_value}\""
    end
    childNodes.each { |child| dump += child.printTree(inner) }
    dump
  end
end
# Root node of the pure-Ruby tree.
class Document < Node
  def to_s
    "#document"
  end

  def initialize
    super(nil)
  end

  # Tree dump of the whole document, headed by the "#document" marker.
  def printTree(indent = 0)
    dump = to_s
    childNodes.each { |child| dump += child.printTree(indent + 2) }
    dump
  end
end
# Doctype node of the pure-Ruby tree.
class DocumentType < Node
  def to_s
    "<!DOCTYPE #{name}>"
  end
end
# Container used for fragment parsing: an Element created without a name.
class DocumentFragment < Element
  def initialize
    super(nil)
  end

  # Dumps only the children; the fragment itself produces no line.
  def printTree(indent = 0)
    dump = ""
    childNodes.each { |child| dump += child.printTree(indent + 2) }
    dump
  end
end
# Text node of the pure-Ruby tree; the text lives in +value+.
class TextNode < Node
  def initialize(value)
    super(nil)
    @value = value
  end

  # The text wrapped in double quotes, as used in tree dumps.
  def to_s
    "\"#{value}\""
  end
end
# Comment node of the pure-Ruby tree; the comment text lives in +value+.
class CommentNode < Node
  def initialize(value)
    super(nil)
    @value = value
  end

  # The comment in markup form, as used in tree dumps.
  def to_s
    "<!-- #{value} -->"
  end
end
# Tree builder that produces the pure-Ruby SimpleTree nodes.
class TreeBuilder < Base::TreeBuilder
  # Registers the node classes of this module with the builder.
  def initialize
    @documentClass = Document
    @doctypeClass  = DocumentType
    @elementClass  = Element
    @commentClass  = CommentNode
    @fragmentClass = DocumentFragment
  end

  # Serialises +node+ in the tree-dump format used by the test suite.
  def testSerializer(node)
    node.printTree
  end

  # Stores the fragment returned by the base class as the current
  # document and returns its child nodes.
  def getFragment
    @document = super
    @document.childNodes
  end
end
end
end
end

View file

@ -0,0 +1,11 @@
require 'test/unit'
# Directory three levels above this file (absolute path).
HTML5LIB_BASE = File.expand_path(File.join('..', '..', '..'), __FILE__)

# Make the sibling lib/ directory and this tests directory loadable;
# the tests directory ends up first on the load path.
$LOAD_PATH.unshift File.join(File.dirname(File.dirname(__FILE__)), 'lib')
$LOAD_PATH.unshift File.dirname(__FILE__)

# Returns the paths of all fixture files (names containing a dot) found in
# HTML5LIB_BASE/tests/<subdirectory>.
def html5lib_test_files(subdirectory)
  pattern = File.join(HTML5LIB_BASE, 'tests', subdirectory, '*.*')
  Dir[pattern]
end

View file

@ -0,0 +1,36 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/inputstream'
# Encoding-detection tests for HTMLInputStream.
class Html5EncodingTestCase < Test::Unit::TestCase

  # The chardet test is only defined when the UniversalDetector library
  # can be loaded; otherwise a notice is printed and it is skipped.
  begin
    require 'rubygems'
    require 'UniversalDetector'

    def test_chardet
      sample = File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')
      File.open(sample) do |file|
        stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
        assert_equal 'big5', stream.charEncoding.downcase
      end
    end
  rescue LoadError
    puts "chardet not found, skipping chardet tests"
  end

  # One test method per "#data" section of every encoding fixture file.
  html5lib_test_files('encoding').each do |test_file|
    test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
    sections = File.read(test_file).split("#data\n")

    sections.each_with_index do |data, index|
      next if data.empty?

      input, encoding_section = data.split(/\n#encoding\s+/, 2)
      # Only the first word after "#encoding" names the expected encoding.
      encoding = encoding_section.split[0]

      define_method format('test_%s_%d', test_name, index + 1) do
        stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
        assert_equal encoding.downcase, stream.charEncoding.downcase, input
      end
    end
  end
end

212
vendor/plugins/HTML5lib/tests/test_lxp.rb vendored Executable file
View file

@ -0,0 +1,212 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/liberalxmlparser'
# Matches a start (or empty-element) tag carrying at least one
# double-quoted attribute: $1 is the tag name with trailing whitespace,
# $2 the run of attributes, $3 the optional "/" of an empty-element tag.
XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/

# Rewrites every tag matched by XMLELEM with its whitespace-separated
# attribute tokens in sorted order, so two serialisations that differ only
# in attribute order compare equal.
#
# The previous SORTATTRS constant was a single-quoted string, so its
# "#{...}" was never evaluated and every attributed tag was replaced by
# the same literal text, hiding real attribute differences.
def sort_xml_attributes(xml)
  xml.gsub(XMLELEM) { "<#{$1}#{$2.split.sort.join(' ')}#{$3}>" }
end

# Parses +input+ with +parser+ and compares the serialised root with
# +expected+ (single quotes in the output are normalised to double quotes).
#
# When +expected+ is omitted the input itself is the expectation: decimal
# character references are decoded and attribute order is normalised on
# both sides before comparing.
def assert_xml_equal(input, expected=nil, parser=HTML5lib::XMLParser)
  document = parser.parse(input.chomp).root
  if not expected
    expected = sort_xml_attributes(input.chomp)
    expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')}
    output = sort_xml_attributes(document.to_s.gsub(/'/,'"'))
    assert_equal(expected, output)
  else
    assert_equal(expected, document.to_s.gsub(/'/,'"'))
  end
end

# Same as assert_xml_equal, but parses with the XHTML parser by default.
def assert_xhtml_equal(input, expected=nil, parser=HTML5lib::XHTMLParser)
  assert_xml_equal(input, expected, parser)
end
# Checks that tag-soup input is repaired into a complete XHTML document.
class BasicXhtml5Test < Test::Unit::TestCase

  # Mis-nested </b></i> end tags are re-ordered.
  def test_title_body_mismatched_close
    expected = [
      '<html xmlns="http://www.w3.org/1999/xhtml">',
      '<head><title>Xhtml</title></head>',
      '<body><b><i>content</i></b></body>',
      '</html>'
    ].join
    assert_xhtml_equal('<title>Xhtml</title><b><i>content</b></i>', expected)
  end

  # A named character reference without its semicolon becomes U+2014.
  def test_title_body_named_charref
    expected = [
      '<html xmlns="http://www.w3.org/1999/xhtml">',
      '<head><title>mdash</title></head>',
      '<body>A ', [0x2014].pack('U'), ' B</body>',
      '</html>'
    ].join
    assert_xhtml_equal('<title>mdash</title>A &mdash B', expected)
  end
end
# Small round-trip checks for the liberal XML parser.
class BasicXmlTest < Test::Unit::TestCase
  # With no expectation given, the input must round-trip unchanged.
  def test_comment
    assert_xml_equal('<x><!-- foo --></x>')
  end

  def test_cdata
    assert_xml_equal('<x><![CDATA[foo]]></x>', '<x>foo</x>')
  end

  def test_simple_text
    assert_xml_equal('<p>foo</p>', '<p>foo</p>')
  end

  def test_optional_close
    assert_xml_equal('<p>foo', '<p>foo</p>')
  end

  def test_html_mismatched
    assert_xml_equal('<b><i>foo</b></i>', '<b><i>foo</i></b>')
  end
end
# OPML documents: mixed-case names must survive, bare ampersands get escaped.
class OpmlTest < Test::Unit::TestCase
  def test_mixedCaseElement
    source = [
      '<opml version="1.0">',
      '<head><ownerName>Dave Winer</ownerName></head>',
      '</opml>'
    ].join
    assert_xml_equal(source)
  end

  def test_mixedCaseAttribute
    source = [
      '<opml version="1.0">',
      '<body><outline isComment="true"/></body>',
      '</opml>'
    ].join
    assert_xml_equal(source)
  end

  def test_malformed
    source = [
      '<opml version="1.0">',
      '<body><outline text="Odds & Ends"/></body>',
      '</opml>'
    ].join
    expected = [
      '<opml version="1.0">',
      '<body><outline text="Odds &amp; Ends"/></body>',
      '</opml>'
    ].join
    assert_xml_equal(source, expected)
  end
end
# Round-trip tests for complete XHTML documents. Every test passes only the
# input, so assert_xhtml_equal compares the parser output with the input
# itself. The heredoc bodies are fixtures and must be kept byte-for-byte.
class XhtmlTest < Test::Unit::TestCase
# Embedded MathML, including decimal character references.
def test_mathml
assert_xhtml_equal <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>MathML</title></head>
<body>
<math xmlns="http://www.w3.org/1998/Math/MathML">
<mrow>
<mi>x</mi>
<mo>=</mo>
<mfrac>
<mrow>
<mrow>
<mo>-</mo>
<mi>b</mi>
</mrow>
<mo>&#177;</mo>
<msqrt>
<mrow>
<msup>
<mi>b</mi>
<mn>2</mn>
</msup>
<mo>-</mo>
<mrow>
<mn>4</mn>
<mo>&#8290;</mo>
<mi>a</mi>
<mo>&#8290;</mo>
<mi>c</mi>
</mrow>
</mrow>
</msqrt>
</mrow>
<mrow>
<mn>2</mn>
<mo>&#8290;</mo>
<mi>a</mi>
</mrow>
</mfrac>
</mrow>
</math>
</body></html>
EOX
end
# Embedded SVG with mixed-case and hyphenated attribute names.
def test_svg
assert_xhtml_equal <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>SVG</title></head>
<body>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
<path d="M38,38c0-12,24-15,23-2c0,9-16,13-16,23v7h11v-4c0-9,17-12,17-27
c-2-22-45-22-45,3zM45,70h11v11h-11z" fill="#371">
</path>
<circle cx="50" cy="50" r="45" fill="none" stroke="#371" stroke-width="10">
</circle>
</svg>
</body></html>
EOX
end
# SVG using XLink attributes through a locally declared "l" prefix.
def test_xlink
assert_xhtml_equal <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>XLINK</title></head>
<body>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
<defs xmlns:l="http://www.w3.org/1999/xlink">
<radialGradient id="s1" fx=".4" fy=".2" r=".7">
<stop stop-color="#FE8"/>
<stop stop-color="#D70" offset="1"/>
</radialGradient>
<radialGradient id="s2" fx=".8" fy=".5" l:href="#s1"/>
<radialGradient id="s3" fx=".5" fy=".9" l:href="#s1"/>
<radialGradient id="s4" fx=".1" fy=".5" l:href="#s1"/>
</defs>
<g stroke="#940">
<path d="M73,29c-37-40-62-24-52,4l6-7c-8-16,7-26,42,9z" fill="url(#s1)"/>
<path d="M47,8c33-16,48,21,9,47l-6-5c38-27,20-44,5-37z" fill="url(#s2)"/>
<path d="M77,32c22,30,10,57-39,51l-1-8c3,3,67,5,36-36z" fill="url(#s3)"/>
<path d="M58,84c-4,20-38-4-8-24l-6-5c-36,43,15,56,23,27z" fill="url(#s4)"/>
<path d="M40,14c-40,37-37,52-9,68l1-8c-16-13-29-21,16-56z" fill="url(#s1)"/>
<path d="M31,33c19,23,20,7,35,41l-9,1.7c-4-19-8-14-31-37z" fill="url(#s2)"/>
</g>
</svg>
</body></html>
EOX
end
# A void element written in empty-element form.
def test_br
assert_xhtml_equal <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>XLINK</title></head>
<body>
<br/>
</body></html>
EOX
end
# NOTE(review): the "x" prefix means the name does not start with "test",
# so this case is presumably disabled on purpose -- confirm before renaming.
def xtest_strong
assert_xhtml_equal <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>XLINK</title></head>
<body>
<strong></strong>
</body></html>
EOX
end
end

View file

@ -0,0 +1,108 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/treebuilders'
require 'html5lib/html5parser'
# Tree builders exercised by the parser tests. 'hpricot' is added only
# when the library can be loaded.
$tree_types_to_test = %w[simpletree rexml]
begin
  require 'hpricot'
  $tree_types_to_test << 'hpricot'
rescue LoadError
  # Hpricot is optional; run with the remaining tree builders.
end

# When true, the number of reported parse errors is checked as well.
$CHECK_PARSER_ERRORS = false

puts "Testing: #{$tree_types_to_test.join(', ')}"
class Html5ParserTestCase < Test::Unit::TestCase
# True when string +b+ begins with string +a+.
def self.startswith?(a, b)
  b[0... a.length] == a
end

# Splits one "#data" section of a tree-construction fixture into its parts.
#
# Lines before "#errors" are the input, lines between "#errors" and
# "#document" / "#document-fragment <name>" are the expected errors, and
# the remaining lines are the expected tree dump (a leading "| " is
# removed from dump lines). Empty lines are dropped.
#
# Returns [innerHTML, input, output, errors], where innerHTML is the
# fragment context element name (nil for whole-document tests), input and
# output are newline-joined strings and errors is an array of lines.
#
# Raises ArgumentError when "#document-fragment" names no context element.
def self.parseTestcase(data)
  innerHTML = nil
  input = []
  output = []
  errors = []
  currentList = input
  data.split(/\n/).each do |line|
    # "#document-fragment" also starts with "#document", so three prefix
    # checks cover all four section markers.
    section_marker = (startswith?("#errors", line) or
                      startswith?("#document", line) or
                      startswith?("#data", line))
    if !line.empty? and !section_marker
      # Identity check: Array#== compares contents, so two still-empty
      # lists would be "equal" and an input line starting with "|" would
      # be truncated.
      if currentList.equal?(output) and startswith?("|", line)
        currentList.push(line[2..-1])
      else
        currentList.push(line)
      end
    elsif line == "#errors"
      currentList = errors
    elsif line == "#document" or startswith?("#document-fragment", line)
      if startswith?("#document-fragment", line)
        innerHTML = line[19..-1]
        # The original raised the undefined constant AssertionError here.
        raise ArgumentError, "#document-fragment without a context element" unless innerHTML
      end
      currentList = output
    end
  end
  return innerHTML, input.join("\n"), output.join("\n"), errors
end
# Converts a serialised tree dump to the format used in the test cases:
# the first line (the "#document" header) is dropped and the leading
# three-character "|  " prefix is removed from every longer dump line.
def convertTreeDump(treedump)
  body = treedump.split(/\n/)[1..-1]
  stripped = body.map do |line|
    if line.length > 2 and line[0] == ?|
      line[3..-1]
    else
      line
    end
  end
  stripped.join("\n")
end

# Sorts each run of consecutive, equally indented name=value lines so that
# attribute order does not influence the comparison.
def sortattrs(output)
  attribute_run = /^(\s+)\w+=.*(\n\1\w+=.*)+/
  output.gsub(attribute_run) { |run| run.split("\n").sort.join("\n") }
end
# Defines one test method per fixture section and per tree builder:
# test_<file>_<section number>_<tree name>.
html5lib_test_files('tree-construction').each do |test_file|
  test_name = File.basename(test_file).sub('.dat', '')
  File.read(test_file).split("#data\n").each_with_index do |data, index|
    next if data.empty?
    innerHTML, input, expected_output, expected_errors = parseTestcase(data)
    $tree_types_to_test.each do |tree_name|
      define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
        parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))
        # A fragment test names its context element; otherwise parse a
        # whole document.
        if innerHTML
          parser.parseFragment(input, innerHTML)
        else
          parser.parse(input)
        end
        actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
        # Attribute order is normalised on both sides before comparing.
        assert_equal sortattrs(expected_output), sortattrs(actual_output), [
          'Input:', input,
          'Expected:', expected_output,
          'Recieved:', actual_output
        ].join("\n")
        if $CHECK_PARSER_ERRORS
          actual_errors = parser.errors.map do |(line, col), message|
            'Line: %i Col: %i %s' % [line, col, message]
          end
          # assert_equal takes (expected, actual); the original passed the
          # two counts the other way round, which mislabels them on failure.
          assert_equal expected_errors.length, parser.errors.length, [
            'Expected errors:', expected_errors.join("\n"),
            'Actual errors:', actual_errors.join("\n")
          ].join("\n")
        end
      end
    end
  end
end
end

View file

@ -0,0 +1,206 @@
#!/usr/bin/env ruby
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/sanitizer'
require 'html5lib/html5parser'
require 'html5lib/liberalxmlparser'
class SanitizeTest < Test::Unit::TestCase
include HTML5lib
# Parses +stream+ as an XHTML fragment through the sanitizing tokenizer and
# returns the joined result with single quotes turned into double quotes.
def sanitize_xhtml(stream)
  fragments = XHTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer)
  fragments.join('').tr("'", '"')
end

# Same as sanitize_xhtml, but parses with the HTML parser.
def sanitize_html(stream)
  fragments = HTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer)
  fragments.join('').tr("'", '"')
end
# One "allow" test per whitelisted element: the element and its title
# attribute must survive, while the unknown <bad> element inside it is
# escaped. Table-related and option elements are skipped (see TODO).
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
next if %w[caption col colgroup optgroup option table tbody td tfoot th thead tr].include?(tag_name) ### TODO
define_method "test_should_allow_#{tag_name}_tag" do
if tag_name == 'image'
# <image> is expected to come back as <img>.
assert_equal "<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
elsif VOID_ELEMENTS.include?(tag_name)
# Void elements are expected self-closed, with the content after them.
assert_equal "<#{tag_name} title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
else
# The HTML parser lower-cases the tag name; the XHTML parser keeps it.
assert_equal "<#{tag_name.downcase} title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>",
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
assert_equal "<#{tag_name} title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>",
sanitize_xhtml("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
end
end
end
# One "forbid" test per whitelisted element written in upper case: the
# whole tag is expected to be escaped.
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
define_method "test_should_forbid_#{tag_name.upcase}_tag" do
assert_equal "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;",
sanitize_html("<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>")
end
end
# One "allow" test per whitelisted attribute except style.
HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
next if attribute_name == 'style'
define_method "test_should_allow_#{attribute_name}_attribute" do
assert_equal "<p #{attribute_name.downcase}=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
sanitize_html("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
assert_equal "<p #{attribute_name}=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
sanitize_xhtml("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
end
end
# One "forbid" test per whitelisted attribute written in upper case: the
# attribute is expected to be dropped.
HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
assert_equal "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
sanitize_html("<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>")
end
end
# Whitelisted protocol names are accepted as href values ...
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
define_method "test_should_allow_#{protocol}_uris" do
assert_equal "<a href=\"#{protocol}\">foo</a>",
sanitize_html(%(<a href="#{protocol}">foo</a>))
end
end
# ... and so are their upper-case spellings.
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
define_method "test_should_allow_uppercase_#{protocol}_uris" do
assert_equal "<a href=\"#{protocol.upcase}\">foo</a>",
sanitize_html(%(<a href="#{protocol.upcase}">foo</a>))
end
end
# The onclick handler is dropped and the nested <script> is escaped.
def test_should_allow_anchors
assert_equal "<a href=\"foo\">&lt;script&gt;baz&lt;/script&gt;</a>",
sanitize_html("<a href='foo' onclick='bar'><script>baz</script></a>")
end
# RFC 3986, sec 4.2
def test_allow_colons_in_path_component
assert_equal "<a href=\"./this:that\">foo</a>",
sanitize_html("<a href=\"./this:that\">foo</a>")
end
# Each listed image attribute is kept while onclick is removed.
%w(src width height alt).each do |img_attr|
define_method "test_should_allow_image_#{img_attr}_attribute" do
assert_equal "<img #{img_attr}=\"foo\"/>",
sanitize_html("<img #{img_attr}='foo' onclick='bar' />")
end
end
def test_should_handle_non_html
assert_equal 'abc', sanitize_html("abc")
end
def test_should_handle_blank_text
assert_equal '', sanitize_html('')
end
# javascript: URLs, with or without leading whitespace, cause the
# src/href attribute to be stripped while title is kept.
[%w(img src), %w(a href)].each do |(tag, attr)|
# Expected tail of the output: void elements are self-closed.
close = VOID_ELEMENTS.include?(tag) ? "/>boo" : ">boo</#{tag}>"
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do
assert_equal %(<#{tag} title="1"#{close}), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo</#{tag}>))
end
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do
assert_equal %(<#{tag} title="1"#{close}), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo</#{tag}>))
end
end
# Obfuscated javascript: URLs in an image src (mixed case, character
# references, embedded or leading whitespace/control characters). For each
# payload the src attribute must be removed, leaving a bare <img/>.
# The payload strings are fixtures and must be kept byte-for-byte.
[%(<img src="javascript:alert('XSS');" />),
%(<img src=javascript:alert('XSS') />),
%(<img src="JaVaScRiPt:alert('XSS')" />),
%(<img src='javascript:alert(&quot;XSS&quot;)' />),
%(<img src='javascript:alert(String.fromCharCode(88,83,83))' />),
%(<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />),
%(<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041' />),
%(<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29' />),
%(<img src="jav\tascript:alert('XSS');" />),
%(<img src="jav&#x09;ascript:alert('XSS');" />),
%(<img src="jav&#x0A;ascript:alert('XSS');" />),
%(<img src="jav&#x0D;ascript:alert('XSS');" />),
%(<img src=" &#14; javascript:alert('XSS');" />),
%(<img src="&#x20;javascript:alert('XSS');" />),
%(<img src="&#xA0;javascript:alert('XSS');" />)].each_with_index do |img_hack, i|
define_method "test_should_not_fall_for_xss_image_hack_#{i}" do
assert_equal "<img/>", sanitize_html(img_hack)
end
end
# Individual XSS regression cases. Inputs and expected outputs are
# fixtures and must be kept byte-for-byte.

# The expected output contains "\357\277\275" where the input had a NUL.
def test_should_sanitize_tag_broken_up_by_null
assert_equal "&lt;scr\357\277\275ipt&gt;alert(\"XSS\")&lt;/scr\357\277\275ipt&gt;", sanitize_html(%(<scr\0ipt>alert(\"XSS\")</scr\0ipt>))
end
def test_should_sanitize_invalid_script_tag
assert_equal "&lt;script XSS=\"\" SRC=\"http://ha.ckers.org/xss.js\"&gt;&lt;/script&gt;", sanitize_html(%(<script/XSS SRC="http://ha.ckers.org/xss.js"></script>))
end
def test_should_sanitize_script_tag_with_multiple_open_brackets
assert_equal "&lt;&lt;script&gt;alert(\"XSS\");//&lt;&lt;/script&gt;", sanitize_html(%(<<script>alert("XSS");//<</script>))
assert_equal %(&lt;iframe src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;), sanitize_html(%(<iframe src=http://ha.ckers.org/scriptlet.html\n<))
end
def test_should_sanitize_unclosed_script
assert_equal "&lt;script src=\"http://ha.ckers.org/xss.js?\"&gt;<b/>", sanitize_html(%(<script src=http://ha.ckers.org/xss.js?<b>))
end
def test_should_sanitize_half_open_scripts
assert_equal "<img/>", sanitize_html(%(<img src="javascript:alert('XSS')"))
end
# Every character of the tag and of the javascript: URL is separated by a
# newline.
def test_should_not_fall_for_ridiculous_hack
img_hack = %(<img\nsrc\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n />)
assert_equal "<img/>", sanitize_html(img_hack)
end
# Only some of the style declarations in the input appear in the expected
# output; position, left, top, z-index, background-image and
# background-repeat are absent.
def test_platypus
assert_equal %(<a href=\"http://www.ragingplatypus.com/\" style=\"display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;\">never trust your upstream platypus</a>),
sanitize_html(%(<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>))
end
# A -moz-binding declaration leaves an empty style attribute.
def test_xul
assert_equal %(<p style="">fubar</p>),
sanitize_html(%(<p style="-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')">fubar</p>))
end
def test_input_image
assert_equal %(<input type="image"/>),
sanitize_html(%(<input type="image" src="javascript:alert('XSS');" />))
end
def test_non_alpha_non_digit
assert_equal "&lt;script XSS=\"\" src=\"http://ha.ckers.org/xss.js\"&gt;&lt;/script&gt;",
sanitize_html(%(<script/XSS src="http://ha.ckers.org/xss.js"></script>))
assert_equal "<a>foo</a>",
sanitize_html('<a onclick!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>foo</a>')
assert_equal "<img src=\"http://ha.ckers.org/xss.js\"/>",
sanitize_html('<img/src="http://ha.ckers.org/xss.js"/>')
end
# dynsrc and lowsrc are expected to be removed.
def test_img_dynsrc_lowsrc
assert_equal "<img/>",
sanitize_html(%(<img dynsrc="javascript:alert('XSS')" />))
assert_equal "<img/>",
sanitize_html(%(<img lowsrc="javascript:alert('XSS')" />))
end
def test_div_background_image_unicode_encoded
assert_equal '<div style="">foo</div>',
sanitize_html(%(<div style="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">foo</div>))
end
def test_div_expression
assert_equal '<div style="">foo</div>',
sanitize_html(%(<div style="width: expression(alert('XSS'));">foo</div>))
end
def test_img_vbscript
assert_equal '<img/>',
sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
end
end

View file

@ -0,0 +1,78 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/tokenizer'
require 'tokenizer_test_parser'
# Minimal JSON reader used when the 'jsonx' library cannot be loaded.
# NOTE(review): 'jsonx' looks like a deliberately unavailable name that
# forces this fallback -- confirm before changing it to 'json'.
begin
  require 'jsonx'
rescue LoadError
  class JSON
    # Converts +json+ into Ruby data by rewriting it as a Ruby literal and
    # evaluating it. The argument is left untouched, so frozen strings work.
    #
    # SECURITY: this uses eval and must only be fed trusted fixture files.
    def self.parse(json)
      # `"key":` becomes `"key"=>`.
      ruby_source = json.gsub(/"\s*:/, '"=>')
      # Replace \uXXXX escapes with the UTF-8 character they denote.
      ruby_source = ruby_source.gsub(/\\u[0-9a-fA-F]{4}/) { |esc| [esc[2..-1].to_i(16)].pack('U') }
      # JSON's null is not a Ruby keyword; this local is visible to eval.
      null = nil
      eval ruby_source
    end
  end
end
class Html5TokenizerTestCase < Test::Unit::TestCase
# True when +token+ is a token array whose first element is +token_name+.
# The bare 'ParseError' marker string never matches.
def type_of?(token_name, token)
  return false if token == 'ParseError'
  token_name == token.first
end

# Replaces the attribute list (an array of [name, value] pairs) of every
# StartTag token with a hash, in place. The pairs are reversed first, so
# for a repeated name the first occurrence wins.
def convert_attribute_arrays_to_hashes(tokens)
  tokens.map do |token|
    token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
    token
  end
end

# Merges runs of adjacent Character tokens into one token by concatenating
# their data into the first token of the run (which is modified in place).
def concatenate_consecutive_characters(tokens)
  merged = []
  tokens.each do |token|
    if type_of?('Character', token) and merged.any? and type_of?('Character', merged.last)
      merged.last[1] = merged.last[1] + token[1]
    else
      merged << token
    end
  end
  merged
end
# Runs one tokenizer fixture (+data+, a hash read from the JSON test file)
# once per listed content model flag, defaulting to :PCDATA, and compares
# the produced tokens with the expected 'output'.
def tokenizer_test(data)
  flags = data['contentModelFlags'] || [:PCDATA]
  flags.each do |content_model_flag|
    message = [
      'Description:', data['description'],
      'Input:', data['input'],
      'Content Model Flag:', content_model_flag
    ].join("\n")

    assert_nothing_raised message do
      tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])
      tokenizer.contentModelFlag = content_model_flag.to_sym
      if data.has_key?('lastStartTag')
        tokenizer.currentToken = {:type => :startTag, :name => data['lastStartTag']}
      end

      tokens = TokenizerTestParser.new(tokenizer).parse
      with_hashes = convert_attribute_arrays_to_hashes(tokens)
      actual = concatenate_consecutive_characters(with_hashes)
      expected = concatenate_consecutive_characters(data['output'])
      assert_equal expected, actual, message
    end
  end
end
# Defines one test method per entry of the 'tests' array of every
# tokenizer fixture file: test_<file>_<entry number>.
html5lib_test_files('tokenizer').each do |test_file|
  test_name = File.basename(test_file).sub('.test', '')
  fixture = JSON.parse(File.read(test_file))
  fixture['tests'].each_with_index do |data, index|
    method_name = format('test_%s_%d', test_name, index + 1)
    define_method(method_name) { tokenizer_test data }
  end
end
end

View file

@ -0,0 +1,62 @@
require 'html5lib/constants'
# Converts the token stream of a tokenizer into the array format used by
# the tokenizer fixtures (["StartTag", name, data], "ParseError", ...).
class TokenizerTestParser
  # +tokenizer+ is any object whose #each yields token hashes with a
  # :type key.
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Consumes the whole token stream and returns the converted tokens.
  # Each token is dispatched to the process<Type> method named after its
  # :type.
  def parse
    @outputTokens = []
    @tokenizer.each do |token|
      send("process#{token[:type]}", token)
    end
    @outputTokens
  end

  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:data]])
  end

  def processStartTag(token)
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An empty tag is reported as a StartTag, preceded by a ParseError when
  # the element is not a void element.
  def processEmptyTag(token)
    if not HTML5lib::VOID_ELEMENTS.include? token[:name]
      @outputTokens.push("ParseError")
    end
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An end tag that carries data (attributes) is a parse error.
  def processEndTag(token)
    if token[:data].length > 0
      self.processParseError(token)
    end
    @outputTokens.push(["EndTag", token[:name]])
  end

  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  # Whitespace-only character tokens are reported like any other text.
  alias processSpaceCharacters processCharacters

  # End of input produces no output token.
  def processEOF(token)
  end

  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end

View file

@ -31,6 +31,9 @@ Globals = {
:maruku_signature => false, :maruku_signature => false,
:code_background_color => '#fef', :code_background_color => '#fef',
:code_show_spaces => false, :code_show_spaces => false,
:filter_html => false,
:html_math_output_mathml => true, # also set :html_math_engine :html_math_output_mathml => true, # also set :html_math_engine
:html_math_engine => 'itex2mml', #ritex, itex2mml, none :html_math_engine => 'itex2mml', #ritex, itex2mml, none

View file

@ -477,7 +477,7 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
end end
id = match[1]; url = match[2]; title = match[3]; id = match[1]; url = match[2]; title = match[3];
id = id.strip.downcase.gsub(' ','_') id = sanitize_ref_id(id)
hash = self.refs[id] = {:url=>url,:title=>title} hash = self.refs[id] = {:url=>url,:title=>title}

View file

@ -287,7 +287,7 @@ module MaRuKu; module In; module Markdown; module SpanLevelParser
end end
def extension_meta(src, con, break_on_chars) def extension_meta(src, con, break_on_chars)
if m = src.read_regexp(/([^\s\:]+):/) if m = src.read_regexp(/([^\s\:\"\']+):/)
name = m[1] name = m[1]
al = read_attribute_list(src, con, break_on_chars) al = read_attribute_list(src, con, break_on_chars)
# puts "#{name}=#{al.inspect}" # puts "#{name}=#{al.inspect}"
@ -581,9 +581,9 @@ module MaRuKu; module In; module Markdown; module SpanLevelParser
ref_id = read_ref_id(src,con) ref_id = read_ref_id(src,con)
if ref_id if ref_id
if ref_id.size == 0 if ref_id.size == 0
ref_id = children.to_s.downcase.gsub(' ','_') ref_id = sanitize_ref_id(children.to_s)
else else
ref_id = ref_id.downcase ref_id = sanitize_ref_id(ref_id)
end end
con.push_element md_link(children, ref_id) con.push_element md_link(children, ref_id)
else else

View file

@ -108,6 +108,7 @@ module MaRuKu
# Input is a LineSource # Input is a LineSource
def t2_parse_blocks(src, output) def t2_parse_blocks(src, output)
while src.cur_line while src.cur_line
l = src.shift_line
# ignore empty line # ignore empty line
if l.t2_empty? then if l.t2_empty? then
@ -115,7 +116,6 @@ module MaRuKu
next next
end end
l = src.shift_line
# TODO: lists # TODO: lists
# TODO: xml # TODO: xml
# TODO: `==` # TODO: `==`

View file

@ -741,7 +741,17 @@ of the form `#ff00ff`.
return a return a
end end
=begin maruku_doc
Attribute: filter_html
Scope: document
If true, raw HTML is discarded from the output.
=end
def to_html_raw_html def to_html_raw_html
return [] if get_setting(:filter_html)
raw_html = self.raw_html raw_html = self.raw_html
if rexml_doc = @parsed_html if rexml_doc = @parsed_html
root = rexml_doc.root root = rexml_doc.root

View file

@ -152,7 +152,7 @@ end end
module MaRuKu; module Out; module Latex module MaRuKu; module Out; module Latex
def to_latex_hrule; "\n\\vspace{.5em} \\hrule \\vspace{.5em}\n" end def to_latex_hrule; "\n\\vspace{.5em} \\hrule \\vspace{.5em}\n" end
def to_latex_linebreak; "\\linebreak " end def to_latex_linebreak; "\\newline " end
def to_latex_paragraph def to_latex_paragraph
children_to_latex+"\n\n" children_to_latex+"\n\n"

View file

@ -146,6 +146,10 @@ module MaRuKu; module Strings
s[0, i+1].strip s[0, i+1].strip
end end
def sanitize_ref_id(x)
x.downcase.gsub(' ','_').gsub(/[^\w]/,'')
end
# removes initial quote # removes initial quote
def unquote(s) def unquote(s)

View file

@ -155,7 +155,7 @@ module MaRuKu; module Tests
["[a]", [ md_link(["a"],'a')], 'Empty link'], ["[a]", [ md_link(["a"],'a')], 'Empty link'],
["[a][]", ], ["[a][]", ],
["[a][]b", [ md_link(["a"],'a'),'b'], 'Empty link'], ["[a][]b", [ md_link(["a"],'a'),'b'], 'Empty link'],
["[a\\]][]", [ md_link(["a]"],'a]')], 'Escape inside link'], ["[a\\]][]", [ md_link(["a]"],'a')], 'Escape inside link (throw ?] away)'],
["[a", :throw, 'Link not closed'], ["[a", :throw, 'Link not closed'],
["[a][", :throw, 'Ref not closed'], ["[a][", :throw, 'Ref not closed'],

View file

@ -19,7 +19,7 @@
#++ #++
module MaRuKu module MaRuKu
Version = '0.5.5' Version = '0.5.6'
MarukuURL = 'http://maruku.rubyforge.org/' MarukuURL = 'http://maruku.rubyforge.org/'