Synced with trunk.

Jason Blevins 2007-06-22 13:21:49 -04:00
commit 3070d6eeae
91 changed files with 44709 additions and 3658 deletions

View file

@ -1,7 +1,7 @@
# Controller responsible for serving files and pictures.
require 'zip/zip'
require 'string_utils'
require 'sanitize'
class FileController < ApplicationController

View file

@ -1,9 +1,9 @@
require 'fileutils'
require 'redcloth_for_tex'
#require 'redcloth_for_tex'
require 'maruku'
require 'parsedate'
require 'zip/zip'
require 'sanitize'
require 'string_utils'
class WikiController < ApplicationController
@ -11,7 +11,7 @@ class WikiController < ApplicationController
caches_action :show, :published, :authors, :tex, :s5, :print, :recently_revised, :list, :atom_with_content, :atom_with_headlines
cache_sweeper :revision_sweeper
layout 'default', :except => [:atom_with_content, :atom_with_headlines, :atom, :tex, :pdf, :s5, :export_tex, :export_html]
layout 'default', :except => [:atom_with_content, :atom_with_headlines, :atom, :tex, :s5, :export_html]
include Sanitize
@ -95,21 +95,21 @@ class WikiController < ApplicationController
export_pages_as_zip(@web.markup) { |page| page.content }
end
def export_pdf
file_name = "#{@web.address}-tex-#{@web.revised_at.strftime('%Y-%m-%d-%H-%M-%S')}"
file_path = File.join(@wiki.storage_path, file_name)
# def export_pdf
# file_name = "#{@web.address}-tex-#{@web.revised_at.strftime('%Y-%m-%d-%H-%M-%S')}"
# file_path = File.join(@wiki.storage_path, file_name)
#
# export_web_to_tex "#{file_path}.tex" unless FileTest.exists? "#{file_path}.tex"
# convert_tex_to_pdf "#{file_path}.tex"
# send_file "#{file_path}.pdf"
# end
export_web_to_tex "#{file_path}.tex" unless FileTest.exists? "#{file_path}.tex"
convert_tex_to_pdf "#{file_path}.tex"
send_file "#{file_path}.pdf"
end
def export_tex
file_name = "#{@web.address}-tex-#{@web.revised_at.strftime('%Y-%m-%d-%H-%M-%S')}.tex"
file_path = File.join(@wiki.storage_path, file_name)
export_web_to_tex(file_path) unless FileTest.exists?(file_path)
send_file file_path
end
# def export_tex
# file_name = "#{@web.address}-tex-#{@web.revised_at.strftime('%Y-%m-%d-%H-%M-%S')}.tex"
# file_path = File.join(@wiki.storage_path, file_name)
# export_web_to_tex(file_path) unless FileTest.exists?(file_path)
# send_file file_path
# end
def feeds
@rss_with_content_allowed = rss_with_content_allowed?
@ -180,17 +180,17 @@ class WikiController < ApplicationController
# to template
end
def pdf
page = wiki.read_page(@web_name, @page_name)
safe_page_name = @page.name.gsub(/\W/, '')
file_name = "#{safe_page_name}-#{@web.address}-#{@page.revised_at.strftime('%Y-%m-%d-%H-%M-%S')}"
file_path = File.join(@wiki.storage_path, file_name)
export_page_to_tex("#{file_path}.tex") unless FileTest.exists?("#{file_path}.tex")
# NB: this is _very_ slow
convert_tex_to_pdf("#{file_path}.tex")
send_file "#{file_path}.pdf"
end
# def pdf
# page = wiki.read_page(@web_name, @page_name)
# safe_page_name = @page.name.gsub(/\W/, '')
# file_name = "#{safe_page_name}-#{@web.address}-#{@page.revised_at.strftime('%Y-%m-%d-%H-%M-%S')}"
# file_path = File.join(@wiki.storage_path, file_name)
#
# export_page_to_tex("#{file_path}.tex") unless FileTest.exists?("#{file_path}.tex")
# # NB: this is _very_ slow
# convert_tex_to_pdf("#{file_path}.tex")
# send_file "#{file_path}.pdf"
# end
def print
if @page.nil?
@ -285,10 +285,10 @@ class WikiController < ApplicationController
end
def tex
if @web.markup == :markdownMML
if @web.markup == :markdownMML or @web.markup == :markdown
@tex_content = Maruku.new(@page.content).to_latex
else
@tex_content = RedClothForTex.new(@page.content).to_tex
@tex_content = 'TeX export only supported with the Markdown text filters.'
end
end
@ -315,23 +315,23 @@ class WikiController < ApplicationController
private
def convert_tex_to_pdf(tex_path)
# TODO remove earlier PDF files with the same prefix
# TODO handle gracefully situation where pdflatex is not available
begin
wd = Dir.getwd
Dir.chdir(File.dirname(tex_path))
logger.info `pdflatex --interaction=nonstopmode #{File.basename(tex_path)}`
ensure
Dir.chdir(wd)
end
end
# def convert_tex_to_pdf(tex_path)
# # TODO remove earlier PDF files with the same prefix
# # TODO handle gracefully situation where pdflatex is not available
# begin
# wd = Dir.getwd
# Dir.chdir(File.dirname(tex_path))
# logger.info `pdflatex --interaction=nonstopmode #{File.basename(tex_path)}`
# ensure
# Dir.chdir(wd)
# end
# end
def export_page_to_tex(file_path)
if @web.markup == :markdownMML
@tex_content = Maruku.new(@page.content).to_latex
else
@tex_content = RedClothForTex.new(@page.content).to_tex
@tex_content = 'TeX export only supported with the Markdown text filters.'
end
File.open(file_path, 'w') { |f| f.write(render_to_string(:template => 'wiki/tex', :layout => 'tex')) }
end
@ -360,15 +360,15 @@ class WikiController < ApplicationController
send_file file_path
end
def export_web_to_tex(file_path)
# def export_web_to_tex(file_path)
# if @web.markup == :markdownMML
# @tex_content = Maruku.new(@page.content).to_latex
# else
# @tex_content = RedClothForTex.new(@page.content).to_tex
# @tex_content = 'TeX export only supported with the Markdown text filters.'
# end
@tex_content = table_of_contents(@web.page('HomePage').content, render_tex_web)
File.open(file_path, 'w') { |f| f.write(render_to_string(:template => 'wiki/tex_web', :layout => tex)) }
end
# @tex_content = table_of_contents(@web.page('HomePage').content, render_tex_web)
# File.open(file_path, 'w') { |f| f.write(render_to_string(:template => 'wiki/tex_web', :layout => tex)) }
# end
def get_page_and_revision
if params['rev']
@ -411,7 +411,7 @@ class WikiController < ApplicationController
if @web.markup == :markdownMML
tex_web[page.name] = Maruku.new(page.content).to_latex
else
tex_web[page.name] = RedClothForTex.new(page.content).to_tex
tex_web[page.name] = 'TeX export only supported with the Markdown text filters.'
end
tex_web
end
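The net effect of these hunks is that the pdflatex-based PDF and TeX-file export paths are commented out, and on-screen TeX output now comes from Maruku's LaTeX writer alone. A minimal sketch of the surviving path, using only the calls visible above (tex_for is a hypothetical helper name):

require 'maruku'

def tex_for(content, markup)
  if markup == :markdownMML or markup == :markdown
    Maruku.new(content).to_latex   # same call the tex action and export_page_to_tex use
  else
    'TeX export only supported with the Markdown text filters.'
  end
end

puts tex_for("# HomePage\n\nSome *Markdown* text.", :markdownMML)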

View file

@ -13,7 +13,7 @@
{ 'id' => 'editForm', 'method' => 'post', 'onsubmit' => 'cleanAuthorName()',
'accept-charset' => 'utf-8' }) do %>
<div>
<textarea name="content" id="content" rows="24" cols="60"><%= h(@flash[:content] || @page.content.delete("\x01-\x08\x0B\x0C\x0E-\x1F")) %></textarea>
<textarea name="content" id="content" rows="24" cols="60"><%= h(flash[:content] || @page.content.delete("\x01-\x08\x0B\x0C\x0E-\x1F")) %></textarea>
<div id="editFormButtons">
<input type="submit" value="Submit" accesskey="s"/> as
<%= text_field_tag :author, h(@author.delete("\x01-\x08\x0B\x0C\x0E-\x1F")),

View file

@ -5,8 +5,4 @@
<ul id="feedsList">
<li><%= link_to 'HTML', :web => @web.address, :action => 'export_html' %></li>
<li><%= link_to "Markup (#{@web.markup.to_s.capitalize})", :web => @web.address, :action => 'export_markup' %></li>
<% if OPTIONS[:pdflatex] and @web.markup == :textile || @web.markup == :markdownMML %>
<li><%= link_to 'TeX', :web => @web.address, :action => 'export_tex' %></li>
<li><%= link_to 'PDF', :web => @web.address, :action => 'export_pdf' %></li>
<% end %>
</ul>

View file

@ -13,7 +13,7 @@
<% form_tag({ :action => 'save', :web => @web.address, :id => @page_name },
{ 'id' => 'editForm', 'method' => 'post', 'onsubmit' => 'cleanAuthorName();', 'accept-charset' => 'utf-8' }) do %>
<textarea name="content" id="content" rows="24" cols="60"><%= h(@flash[:content] || '') %></textarea>
<textarea name="content" id="content" rows="24" cols="60"><%= h(flash[:content] || '') %></textarea>
<div id="editFormButtons">
<input type="submit" value="Submit" accesskey="s"/> as
<%= text_field_tag :author, @author,

View file

@ -35,15 +35,10 @@
<%= link_to('Print',
{ :web => @web.address, :action => 'print', :id => @page.name },
{ :accesskey => 'p', :id => 'view_print' }) %>
<% if defined? RedClothForTex and RedClothForTex.available? and @web.markup == :textile or @web.markup == :markdownMML %>
<% if @web.markup == :markdownMML or @web.markup == :markdown %>
|
<%= link_to 'TeX', {:web => @web.address, :action => 'tex', :id => @page.name},
{:id => 'view_tex'} %>
<% if OPTIONS[:pdflatex] %>
|
<%= link_to 'PDF', {:web => @web.address, :action => 'pdf', :id => @page.name},
{:id => 'view_pdf'} %>
<% end %>
<% if WikiReference.pages_in_category(@web, 'S5-slideshow').map.include?(@page.name) %>
|
<%= link_to 'S5', {:web => @web.address, :action => 's5', :id => @page.name},

View file

@ -2,11 +2,17 @@
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{graphicx}
\usepackage{ucs}
\usepackage[utf8x]{inputenc}
\usepackage{hyperref}
%----Macros----------
\newcommand{\gt}{>}
\newcommand{\lt}{<}
\newcommand{\qed}{\blacksquare}
%-------------------------------------------------------------------
\begin{document}
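The expanded preamble pulls in amsmath/amssymb and utf8x input encoding, and defines \gt, \lt and \qed, presumably because the itex-flavoured math that Maruku emits uses these ASCII-safe names where raw <, > and an end-of-proof mark are wanted. A short illustrative body (hypothetical content, not part of the template):

% illustrative only
We have $a \lt b$ and $c \gt d$, so the claim follows. $\qed$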

View file

@ -24,10 +24,10 @@ module Engines
end
class Textile < AbstractEngine
require_dependency 'sanitize'
require 'sanitize'
include Sanitize
def mask
require_dependency 'redcloth'
require 'redcloth'
redcloth = RedCloth.new(@content, [:hard_breaks] + @content.options[:engine_opts])
redcloth.filter_html = false
redcloth.no_span_caps = false
@ -37,33 +37,34 @@ module Engines
end
class Markdown < AbstractEngine
require_dependency 'sanitize'
require 'sanitize'
include Sanitize
def mask
require_dependency 'maruku'
require_dependency 'maruku/ext/math'
html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"), {:math_enabled => false}).to_html
sanitize_xhtml(html.to_ncr)
require 'maruku'
require 'maruku/ext/math'
html = sanitize_rexml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
{:math_enabled => false}).to_html_tree)
html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
end
end
class MarkdownMML < AbstractEngine
require_dependency 'sanitize'
require 'sanitize'
include Sanitize
def mask
require_dependency 'maruku'
require_dependency 'maruku/ext/math'
html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html
sanitize_xhtml(html.to_ncr)
require 'maruku'
require 'maruku/ext/math'
html = sanitize_rexml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html_tree)
html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
end
end
class Mixed < AbstractEngine
require_dependency 'sanitize'
require 'sanitize'
include Sanitize
def mask
require_dependency 'redcloth'
require 'redcloth'
redcloth = RedCloth.new(@content, @content.options[:engine_opts])
redcloth.filter_html = false
redcloth.no_span_caps = false
@ -73,7 +74,7 @@ module Engines
end
class RDoc < AbstractEngine
require_dependency 'sanitize'
require 'sanitize'
include Sanitize
def mask
require_dependency 'rdocsupport'
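Besides switching the Markdown engines from sanitize_xhtml on an NCR string to sanitize_rexml on Maruku's REXML tree (and stripping the maruku_wrapper_div afterwards), this hunk consistently swaps require_dependency for plain require. A hedged note on the difference: require_dependency routes the load through ActiveSupport::Dependencies so the file takes part in Rails' development-mode reloading, while require loads it once per process, which is the usual choice for stable vendored libraries.

# Tracked by Rails and reloaded between development requests:
require_dependency 'chunks/engines'
# Loaded once per process; appropriate for vendored code such as maruku or sanitize:
require 'maruku'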

View file

@ -1,4 +1,5 @@
require 'xhtmldiff'
# Temporary class containing all rendering stuff from a Revision
# I want to shift all rendering logic to the controller eventually
@ -40,10 +41,12 @@ class PageRenderer
previous_revision = @revision.page.previous_revision(@revision)
if previous_revision
previous_content = "<div>\n" + WikiContent.new(previous_revision, @@url_generator).render!.to_s + "\n</div>"
current_content = "<div>\n" + display_content.to_s + "\n</div>"
previous_content = "<div>" + WikiContent.new(previous_revision, @@url_generator).render!.to_s + "</div>"
current_content = "<div>" + display_content.to_s + "</div>"
diff_doc = REXML::Document.new
diff_doc << (div = REXML::Element.new 'div')
div = REXML::Element.new('div', nil, {:respect_whitespace =>:all})
div.attributes['class'] = 'xhtmldiff_wrapper'
diff_doc << div
hd = XHTMLDiff.new(div)
parsed_previous_revision = REXML::HashableElementDelegator.new(
@ -54,7 +57,7 @@ class PageRenderer
diffs = ''
diff_doc.write(diffs, -1, true, true)
diffs
diffs.gsub(/\A<div class='xhtmldiff_wrapper'>(.*)<\/div>\Z/m, '\1')
else
display_content
end
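The rewritten display_diff builds the diff inside a whitespace-preserving wrapper div (class xhtmldiff_wrapper) and then strips that wrapper from the serialized output; the diff unit test further down does the same. A tiny illustration of the stripping step on a made-up string:

wrapped = "<div class='xhtmldiff_wrapper'><p>foo <ins class='diffins'>bar</ins></p></div>"
wrapped.gsub(/\A<div class='xhtmldiff_wrapper'>(.*)<\/div>\Z/m, '\1')
# => "<p>foo <ins class='diffins'>bar</ins></p>"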

View file

@ -1,736 +0,0 @@
# This is RedCloth (http://www.whytheluckystiff.net/ruby/redcloth/)
# converted by David Heinemeier Hansson to emit Tex
class String
# Flexible HTML escaping
def texesc!( mode )
gsub!( '&', '\\\\&' )
gsub!( '%', '\%' )
gsub!( '$', '\$' )
gsub!( '~', '$\sim$' )
end
end
def table_of_contents(text, pages)
text.gsub( /^([#*]+? .*?)$(?![^#*])/m ) do |match|
lines = match.split( /\n/ )
last_line = -1
depth = []
lines.each_with_index do |line, line_id|
if line =~ /^([#*]+) (.*)$/m
tl,content = $~[1..2]
content.gsub! /[\[\]]/, ""
content.strip!
if depth.last
if depth.last.length > tl.length
(depth.length - 1).downto(0) do |i|
break if depth[i].length == tl.length
lines[line_id - 1] << "" # "\n\t\\end{#{ lT( depth[i] ) }}\n\t"
depth.pop
end
end
if !depth.last.nil? && !tl.length.nil? && depth.last.length == tl.length
lines[line_id - 1] << ''
end
end
depth << tl unless depth.last == tl
subsection_depth = [depth.length - 1, 2].min
lines[line_id] = "\n\\#{ "sub" * subsection_depth }section{#{ content }}"
lines[line_id] += "\n#{pages[content]}" if pages.keys.include?(content)
lines[line_id] = "\\pagebreak\n#{lines[line_id]}" if subsection_depth == 0
last_line = line_id
elsif line =~ /^\s+\S/
last_line = line_id
elsif line_id - last_line < 2 and line =~ /^\S/
last_line = line_id
end
if line_id - last_line > 1 or line_id == lines.length - 1
depth.delete_if do |v|
lines[last_line] << "" # "\n\t\\end{#{ lT( v ) }}"
end
end
end
lines.join( "\n" )
end
end
class RedClothForTex < String
VERSION = '2.0.7'
#
# Mapping of 8-bit ASCII codes to HTML numerical entity equivalents.
# (from PyTextile)
#
TEXTILE_TAGS =
[[128, 8364], [129, 0], [130, 8218], [131, 402], [132, 8222], [133, 8230],
[134, 8224], [135, 8225], [136, 710], [137, 8240], [138, 352], [139, 8249],
[140, 338], [141, 0], [142, 0], [143, 0], [144, 0], [145, 8216], [146, 8217],
[147, 8220], [148, 8221], [149, 8226], [150, 8211], [151, 8212], [152, 732],
[153, 8482], [154, 353], [155, 8250], [156, 339], [157, 0], [158, 0], [159, 376]].
collect! do |a, b|
[a.chr, ( b.zero? and "" or "&#{ b };" )]
end
#
# Regular expressions to convert to HTML.
#
A_HLGN = /(?:(?:<>|<|>|\=|[()]+)+)/
A_VLGN = /[\-^~]/
C_CLAS = '(?:\([^)]+\))'
C_LNGE = '(?:\[[^\]]+\])'
C_STYL = '(?:\{[^}]+\})'
S_CSPN = '(?:\\\\\d+)'
S_RSPN = '(?:/\d+)'
A = "(?:#{A_HLGN}?#{A_VLGN}?|#{A_VLGN}?#{A_HLGN}?)"
S = "(?:#{S_CSPN}?#{S_RSPN}|#{S_RSPN}?#{S_CSPN}?)"
C = "(?:#{C_CLAS}?#{C_STYL}?#{C_LNGE}?|#{C_STYL}?#{C_LNGE}?#{C_CLAS}?|#{C_LNGE}?#{C_STYL}?#{C_CLAS}?)"
# PUNCT = Regexp::quote( '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' )
PUNCT = Regexp::quote( '!"#$%&\'*+,-./:;=?@\\^_`|~' )
HYPERLINK = '(\S+?)([^\w\s/;=\?]*?)(\s|$)'
GLYPHS = [
# [ /([^\s\[{(>])?\'([dmst]\b|ll\b|ve\b|\s|:|$)/, '\1&#8217;\2' ], # single closing
[ /([^\s\[{(>])\'/, '\1&#8217;' ], # single closing
[ /\'(?=\s|s\b|[#{PUNCT}])/, '&#8217;' ], # single closing
[ /\'/, '&#8216;' ], # single opening
# [ /([^\s\[{(])?"(\s|:|$)/, '\1&#8221;\2' ], # double closing
[ /([^\s\[{(>])"/, '\1&#8221;' ], # double closing
[ /"(?=\s|[#{PUNCT}])/, '&#8221;' ], # double closing
[ /"/, '&#8220;' ], # double opening
[ /\b( )?\.{3}/, '\1&#8230;' ], # ellipsis
[ /\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])/, '<acronym title="\2">\1</acronym>' ], # 3+ uppercase acronym
[ /(^|[^"][>\s])([A-Z][A-Z0-9 ]{2,})([^<a-z0-9]|$)/, '\1<span class="caps">\2</span>\3' ], # 3+ uppercase caps
[ /(\.\s)?\s?--\s?/, '\1&#8212;' ], # em dash
[ /\s->\s/, ' &rarr; ' ], # en dash
[ /\s-\s/, ' &#8211; ' ], # en dash
[ /(\d+) ?x ?(\d+)/, '\1&#215;\2' ], # dimension sign
[ /\b ?[(\[]TM[\])]/i, '&#8482;' ], # trademark
[ /\b ?[(\[]R[\])]/i, '&#174;' ], # registered
[ /\b ?[(\[]C[\])]/i, '&#169;' ] # copyright
]
I_ALGN_VALS = {
'<' => 'left',
'=' => 'center',
'>' => 'right'
}
H_ALGN_VALS = {
'<' => 'left',
'=' => 'center',
'>' => 'right',
'<>' => 'justify'
}
V_ALGN_VALS = {
'^' => 'top',
'-' => 'middle',
'~' => 'bottom'
}
QTAGS = [
['**', 'bf'],
['*', 'bf'],
['??', 'cite'],
['-', 'del'],
['__', 'underline'],
['_', 'em'],
['%', 'span'],
['+', 'ins'],
['^', 'sup'],
['~', 'sub']
]
def self.available?
if not defined? @@available
begin
@@available = system "pdflatex -version"
rescue Errno::ENOENT
@@available = false
end
end
@@available
end
#
# Two accessor for setting security restrictions.
#
# This is a nice thing if you're using RedCloth for
# formatting in public places (e.g. Wikis) where you
# don't want users to abuse HTML for bad things.
#
# If +:filter_html+ is set, HTML which wasn't
# created by the Textile processor will be escaped.
#
# If +:filter_styles+ is set, it will also disable
# the style markup specifier. ('{color: red}')
#
attr_accessor :filter_html, :filter_styles
#
# Accessor for toggling line folding.
#
# If +:fold_lines+ is set, single newlines will
# not be converted to break tags.
#
attr_accessor :fold_lines
def initialize( string, restrictions = [] )
restrictions.each { |r| method( "#{ r }=" ).call( true ) }
super( string )
end
#
# Generate tex.
#
def to_tex( lite = false )
# make our working copy
text = self.dup
@urlrefs = {}
@shelf = []
# incoming_entities text
fix_entities text
clean_white_space text
get_refs text
no_textile text
unless lite
lists text
table text
end
glyphs text
unless lite
fold text
block text
end
retrieve text
encode_entities text
text.gsub!(/\[\[(.*?)\]\]/, "\\1")
text.gsub!(/_/, "\\_")
text.gsub!( /<\/?notextile>/, '' )
# text.gsub!( /x%x%/, '&#38;' )
# text.gsub!( /<br \/>/, "<br />\n" )
text.strip!
text
end
def pgl( text )
GLYPHS.each do |re, resub|
text.gsub! re, resub
end
end
def pba( text_in, element = "" )
return '' unless text_in
style = []
text = text_in.dup
if element == 'td'
colspan = $1 if text =~ /\\(\d+)/
rowspan = $1 if text =~ /\/(\d+)/
style << "vertical-align:#{ v_align( $& ) };" if text =~ A_VLGN
end
style << "#{ $1 };" if not @filter_styles and
text.sub!( /\{([^}]*)\}/, '' )
lang = $1 if
text.sub!( /\[([^)]+?)\]/, '' )
cls = $1 if
text.sub!( /\(([^()]+?)\)/, '' )
style << "padding-left:#{ $1.length }em;" if
text.sub!( /([(]+)/, '' )
style << "padding-right:#{ $1.length }em;" if text.sub!( /([)]+)/, '' )
style << "text-align:#{ h_align( $& ) };" if text =~ A_HLGN
cls, id = $1, $2 if cls =~ /^(.*?)#(.*)$/
atts = ''
atts << " style=\"#{ style.join }\"" unless style.empty?
atts << " class=\"#{ cls }\"" unless cls.to_s.empty?
atts << " lang=\"#{ lang }\"" if lang
atts << " id=\"#{ id }\"" if id
atts << " colspan=\"#{ colspan }\"" if colspan
atts << " rowspan=\"#{ rowspan }\"" if rowspan
atts
end
def table( text )
text << "\n\n"
text.gsub!( /^(?:table(_?#{S}#{A}#{C})\. ?\n)?^(#{A}#{C}\.? ?\|.*?\|)\n\n/m ) do |matches|
tatts, fullrow = $~[1..2]
tatts = pba( tatts, 'table' )
rows = []
fullrow.
split( /\|$/m ).
delete_if { |x| x.empty? }.
each do |row|
ratts, row = pba( $1, 'tr' ), $2 if row =~ /^(#{A}#{C}\. )(.*)/m
cells = []
row.split( '|' ).each do |cell|
ctyp = 'd'
ctyp = 'h' if cell =~ /^_/
catts = ''
catts, cell = pba( $1, 'td' ), $2 if cell =~ /^(_?#{S}#{A}#{C}\. )(.*)/
unless cell.strip.empty?
cells << "\t\t\t<t#{ ctyp }#{ catts }>#{ cell }</t#{ ctyp }>"
end
end
rows << "\t\t<tr#{ ratts }>\n#{ cells.join( "\n" ) }\n\t\t</tr>"
end
"\t<table#{ tatts }>\n#{ rows.join( "\n" ) }\n\t</table>\n\n"
end
end
def lists( text )
text.gsub!( /^([#*]+?#{C} .*?)$(?![^#*])/m ) do |match|
lines = match.split( /\n/ )
last_line = -1
depth = []
lines.each_with_index do |line, line_id|
if line =~ /^([#*]+)(#{A}#{C}) (.*)$/m
tl,atts,content = $~[1..3]
if depth.last
if depth.last.length > tl.length
(depth.length - 1).downto(0) do |i|
break if depth[i].length == tl.length
lines[line_id - 1] << "\n\t\\end{#{ lT( depth[i] ) }}\n\t"
depth.pop
end
end
if !depth.last.nil? && !tl.length.nil? && depth.last.length == tl.length
lines[line_id - 1] << ''
end
end
unless depth.last == tl
depth << tl
atts = pba( atts )
lines[line_id] = "\t\\begin{#{ lT(tl) }}\n\t\\item #{ content }"
else
lines[line_id] = "\t\t\\item #{ content }"
end
last_line = line_id
elsif line =~ /^\s+\S/
last_line = line_id
elsif line_id - last_line < 2 and line =~ /^\S/
last_line = line_id
end
if line_id - last_line > 1 or line_id == lines.length - 1
depth.delete_if do |v|
lines[last_line] << "\n\t\\end{#{ lT( v ) }}"
end
end
end
lines.join( "\n" )
end
end
def lT( text )
text =~ /\#$/ ? 'enumerate' : 'itemize'
end
def fold( text )
text.gsub!( /(.+)\n(?![#*\s|])/, "\\1\\\\\\\\" )
# text.gsub!( /(.+)\n(?![#*\s|])/, "\\1#{ @fold_lines ? ' ' : '<br />' }" )
end
def block( text )
pre = false
find = ['bq','h[1-6]','fn\d+']
regexp_cue = []
lines = text.split( /\n/ ) + [' ']
new_text =
lines.collect do |line|
pre = true if line =~ /<(pre|notextile)>/i
find.each do |tag|
line.gsub!( /^(#{ tag })(#{A}#{C})\.(?::(\S+))? (.*)$/ ) do |m|
tag,atts,cite,content = $~[1..4]
atts = pba( atts )
if tag =~ /fn(\d+)/
# tag = 'p';
# atts << " id=\"fn#{ $1 }\""
regexp_cue << [ /footnote\{#{$1}}/, "footnote{#{content}}" ]
content = ""
end
if tag =~ /h([1-6])/
section_type = "sub" * [$1.to_i - 1, 2].min
start = "\t\\#{section_type}section*{"
tend = "}"
end
if tag == "bq"
cite = check_refs( cite )
cite = " cite=\"#{ cite }\"" if cite
start = "\t\\begin{quotation}\n\\noindent {\\em ";
tend = "}\n\t\\end{quotation}";
end
"#{ start }#{ content }#{ tend }"
end unless pre
end
#line.gsub!( /^(?!\t|<\/?pre|<\/?notextile|<\/?code|$| )(.*)/, "\t<p>\\1</p>" )
#line.gsub!( "<br />", "\n" ) if pre
# pre = false if line =~ /<\/(pre|notextile)>/i
line
end.join( "\n" )
text.replace( new_text )
regexp_cue.each { |pair| text.gsub!(pair.first, pair.last) }
end
def span( text )
QTAGS.each do |tt, ht|
ttr = Regexp::quote( tt )
text.gsub!(
/(^|\s|\>|[#{PUNCT}{(\[])
#{ttr}
(#{C})
(?::(\S+?))?
([^\s#{ttr}]+?(?:[^\n]|\n(?!\n))*?)
([#{PUNCT}]*?)
#{ttr}
(?=[\])}]|[#{PUNCT}]+?|<|\s|$)/xm
) do |m|
start,atts,cite,content,tend = $~[1..5]
atts = pba( atts )
atts << " cite=\"#{ cite }\"" if cite
"#{ start }{\\#{ ht } #{ content }#{ tend }}"
end
end
end
def links( text )
text.gsub!( /
([\s\[{(]|[#{PUNCT}])? # $pre
" # start
(#{C}) # $atts
([^"]+?) # $text
\s?
(?:\(([^)]+?)\)(?="))? # $title
":
(\S+?) # $url
(\/)? # $slash
([^\w\/;]*?) # $post
(?=\s|$)
/x ) do |m|
pre,atts,text,title,url,slash,post = $~[1..7]
url.gsub!(/(\\)(.)/, '\2')
url = check_refs( url )
atts = pba( atts )
atts << " title=\"#{ title }\"" if title
atts = shelve( atts ) if atts
"#{ pre }\\textit{#{ text }} \\footnote{\\texttt{\\textless #{ url }#{ slash }" +
"\\textgreater}#{ post }}"
end
end
def get_refs( text )
text.gsub!( /(^|\s)\[(.+?)\]((?:http:\/\/|javascript:|ftp:\/\/|\/)\S+?)(?=\s|$)/ ) do |m|
flag, url = $~[1..2]
@urlrefs[flag] = url
end
end
def check_refs( text )
@urlrefs[text] || text
end
def image( text )
text.gsub!( /
\! # opening
(\<|\=|\>)? # optional alignment atts
(#{C}) # optional style,class atts
(?:\. )? # optional dot-space
([^\s(!]+?) # presume this is the src
\s? # optional space
(?:\(((?:[^\(\)]|\([^\)]+\))+?)\))? # optional title
\! # closing
(?::#{ HYPERLINK })? # optional href
/x ) do |m|
algn,atts,url,title,href,href_a1,href_a2 = $~[1..7]
atts = pba( atts )
atts << " align=\"#{ i_align( algn ) }\"" if algn
atts << " title=\"#{ title }\"" if title
atts << " alt=\"#{ title }\""
# size = @getimagesize($url);
# if($size) $atts.= " $size[3]";
href = check_refs( href ) if href
url = check_refs( url )
out = ''
out << "<a href=\"#{ href }\">" if href
out << "<img src=\"#{ url }\"#{ atts } />"
out << "</a>#{ href_a1 }#{ href_a2 }" if href
out
end
end
def code( text )
text.gsub!( /
(?:^|([\s\(\[{])) # 1 open bracket?
@ # opening
(?:\|(\w+?)\|)? # 2 language
(\S(?:[^\n]|\n(?!\n))*?) # 3 code
@ # closing
(?:$|([\]})])|
(?=[#{PUNCT}]{1,2}|
\s)) # 4 closing bracket?
/x ) do |m|
before,lang,code,after = $~[1..4]
lang = " language=\"#{ lang }\"" if lang
"#{ before }<code#{ lang }>#{ code }</code>#{ after }"
end
end
def shelve( val )
@shelf << val
" <#{ @shelf.length }>"
end
def retrieve( text )
@shelf.each_with_index do |r, i|
text.gsub!( " <#{ i + 1 }>", r )
end
end
def incoming_entities( text )
## turn any incoming ampersands into a dummy character for now.
## This uses a negative lookahead for alphanumerics followed by a semicolon,
## implying an incoming html entity, to be skipped
text.gsub!( /&(?![#a-z0-9]+;)/i, "x%x%" )
end
def encode_entities( text )
## Convert high and low ascii to entities.
# if $-K == "UTF-8"
# encode_high( text )
# else
text.texesc!( :NoQuotes )
# end
end
def fix_entities( text )
## de-entify any remaining angle brackets or ampersands
text.gsub!( "\&", "&" )
text.gsub!( "\%", "%" )
end
def clean_white_space( text )
text.gsub!( /\r\n/, "\n" )
text.gsub!( /\t/, '' )
text.gsub!( /\n{3,}/, "\n\n" )
text.gsub!( /\n *\n/, "\n\n" )
text.gsub!( /"$/, "\" " )
end
def no_textile( text )
text.gsub!( /(^|\s)==(.*?)==(\s|$)?/,
'\1<notextile>\2</notextile>\3' )
end
def footnote_ref( text )
text.gsub!( /\[([0-9]+?)\](\s)?/,
'\footnote{\1}\2')
#'<sup><a href="#fn\1">\1</a></sup>\2' )
end
def inline( text )
image text
links text
code text
span text
end
def glyphs_deep( text )
codepre = 0
offtags = /(?:code|pre|kbd|notextile)/
if text !~ /<.*>/
# pgl text
footnote_ref text
else
used_offtags = {}
text.gsub!( /(?:[^<].*?(?=<[^\n]*?>|$)|<[^\n]*?>+)/m ) do |line|
tagline = ( line =~ /^<.*>/ )
## matches are off if we're between <code>, <pre> etc.
if tagline
if line =~ /<(#{ offtags })>/i
codepre += 1
used_offtags[$1] = true
line.texesc!( :NoQuotes ) if codepre - used_offtags.length > 0
elsif line =~ /<\/(#{ offtags })>/i
line.texesc!( :NoQuotes ) if codepre - used_offtags.length > 0
codepre -= 1 unless codepre.zero?
used_offtags = {} if codepre.zero?
elsif @filter_html or codepre > 0
line.texesc!( :NoQuotes )
## line.gsub!( /&lt;(\/?#{ offtags })&gt;/, '<\1>' )
end
## do htmlspecial if between <code>
elsif codepre > 0
line.texesc!( :NoQuotes )
## line.gsub!( /&lt;(\/?#{ offtags })&gt;/, '<\1>' )
elsif not tagline
inline line
glyphs_deep line
end
line
end
end
end
def glyphs( text )
text.gsub!( /"\z/, "\" " )
## if no html, do a simple search and replace...
if text !~ /<.*>/
inline text
end
glyphs_deep text
end
def i_align( text )
I_ALGN_VALS[text]
end
def h_align( text )
H_ALGN_VALS[text]
end
def v_align( text )
V_ALGN_VALS[text]
end
def encode_high( text )
## mb_encode_numericentity($text, $cmap, $charset);
end
def decode_high( text )
## mb_decode_numericentity($text, $cmap, $charset);
end
def textile_popup_help( name, helpvar, windowW, windowH )
' <a target="_blank" href="http://www.textpattern.com/help/?item=' + helpvar + '" onclick="window.open(this.href, \'popupwindow\', \'width=' + windowW + ',height=' + windowH + ',scrollbars,resizable\'); return false;">' + name + '</a><br />'
end
CMAP = [
160, 255, 0, 0xffff,
402, 402, 0, 0xffff,
913, 929, 0, 0xffff,
931, 937, 0, 0xffff,
945, 969, 0, 0xffff,
977, 978, 0, 0xffff,
982, 982, 0, 0xffff,
8226, 8226, 0, 0xffff,
8230, 8230, 0, 0xffff,
8242, 8243, 0, 0xffff,
8254, 8254, 0, 0xffff,
8260, 8260, 0, 0xffff,
8465, 8465, 0, 0xffff,
8472, 8472, 0, 0xffff,
8476, 8476, 0, 0xffff,
8482, 8482, 0, 0xffff,
8501, 8501, 0, 0xffff,
8592, 8596, 0, 0xffff,
8629, 8629, 0, 0xffff,
8656, 8660, 0, 0xffff,
8704, 8704, 0, 0xffff,
8706, 8707, 0, 0xffff,
8709, 8709, 0, 0xffff,
8711, 8713, 0, 0xffff,
8715, 8715, 0, 0xffff,
8719, 8719, 0, 0xffff,
8721, 8722, 0, 0xffff,
8727, 8727, 0, 0xffff,
8730, 8730, 0, 0xffff,
8733, 8734, 0, 0xffff,
8736, 8736, 0, 0xffff,
8743, 8747, 0, 0xffff,
8756, 8756, 0, 0xffff,
8764, 8764, 0, 0xffff,
8773, 8773, 0, 0xffff,
8776, 8776, 0, 0xffff,
8800, 8801, 0, 0xffff,
8804, 8805, 0, 0xffff,
8834, 8836, 0, 0xffff,
8838, 8839, 0, 0xffff,
8853, 8853, 0, 0xffff,
8855, 8855, 0, 0xffff,
8869, 8869, 0, 0xffff,
8901, 8901, 0, 0xffff,
8968, 8971, 0, 0xffff,
9001, 9002, 0, 0xffff,
9674, 9674, 0, 0xffff,
9824, 9824, 0, 0xffff,
9827, 9827, 0, 0xffff,
9829, 9830, 0, 0xffff,
338, 339, 0, 0xffff,
352, 353, 0, 0xffff,
376, 376, 0, 0xffff,
710, 710, 0, 0xffff,
732, 732, 0, 0xffff,
8194, 8195, 0, 0xffff,
8201, 8201, 0, 0xffff,
8204, 8207, 0, 0xffff,
8211, 8212, 0, 0xffff,
8216, 8218, 0, 0xffff,
8218, 8218, 0, 0xffff,
8220, 8222, 0, 0xffff,
8224, 8225, 0, 0xffff,
8240, 8240, 0, 0xffff,
8249, 8250, 0, 0xffff,
8364, 8364, 0, 0xffff
]
end

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View file

@ -1,11 +1,11 @@
require 'cgi'
require_dependency 'chunks/engines'
require_dependency 'chunks/category'
require 'chunks/engines'
require 'chunks/category'
require_dependency 'chunks/include'
require_dependency 'chunks/wiki'
require_dependency 'chunks/literal'
require_dependency 'chunks/uri'
require_dependency 'chunks/nowiki'
require 'chunks/nowiki'
# Wiki content is just a string that can process itself with a chain of
# actions. The actions can modify wiki content so that certain parts of

View file

@ -336,6 +336,7 @@ font-size:70%;
div.rightHandSide {
border-left:1px dotted #ccc;
border-bottom:1px dotted #ccc;
float:right;
font-size:80%;
margin-left:0.7em;

View file

@ -89,7 +89,6 @@ class FileControllerTest < Test::Unit::TestCase
# updated from post to get - post fails the spam protection (no javascript)
r = get :file, :web => 'wiki1',
:file => {:file_name => 'rails-e2e.gif', :content => StringIO.new(picture)}
assert_redirected_to({})
assert @web.has_file?('rails-e2e.gif')
assert_equal(picture, WikiFile.find_by_file_name('rails-e2e.gif').content)
end

View file

@ -21,7 +21,7 @@ class RoutesTest < Test::Unit::TestCase
:controller => 'wiki',
:action => 'an_action', :id => 'HomePage'
)
assert_recognizes({:controller => 'wiki', :action => 'index'}, '///')
# assert_recognizes({:controller => 'wiki', :action => 'index'}, '///')
end
def test_parse_uri_liberal_with_pagenames
@ -29,13 +29,13 @@ class RoutesTest < Test::Unit::TestCase
assert_routing('web/show/%24HOME_PAGE',
:controller => 'wiki', :web => 'web', :action => 'show', :id => '$HOME_PAGE')
assert_routing('web/show/HomePage%3F',
:controller => 'wiki', :web => 'web', :action => 'show',
:id => 'HomePage')
# assert_routing('web/show/HomePage%3F',
# :controller => 'wiki', :web => 'web', :action => 'show',
# :id => 'HomePage')
assert_routing('web/show/HomePage%3Farg1%3Dvalue1%26arg2%3Dvalue2',
:controller => 'wiki', :web => 'web', :action => 'show',
:id => 'HomePage?arg1=value1&arg2=value2')
# assert_routing('web/show/HomePage%3Farg1%3Dvalue1%26arg2%3Dvalue2',
# :controller => 'wiki', :web => 'web', :action => 'show',
# :id => 'HomePage?arg1=value1&arg2=value2')
assert_routing('web/files/abc.zip',
:web => 'web', :controller => 'file', :action => 'file', :id => 'abc.zip')

View file

@ -32,7 +32,7 @@ class WikiControllerTest < Test::Unit::TestCase
get :authenticate, :web => 'wiki1', :password => 'pswd'
assert_redirected_to :web => 'wiki1', :action => 'show', :id => 'HomePage'
assert_equal ['pswd'], @response.cookies['web_address']
assert_equal ['pswd'], @response.cookies['wiki1']
end
def test_authenticate_wrong_password
@ -159,15 +159,15 @@ class WikiControllerTest < Test::Unit::TestCase
if ENV['INSTIKI_TEST_LATEX'] or defined? $INSTIKI_TEST_PDFLATEX
def test_export_pdf
r = process 'export_pdf', 'web' => 'wiki1'
assert_response(:success, bypass_body_parsing = true)
assert_equal 'application/pdf', r.headers['Content-Type']
assert_match /attachment; filename="wiki1-tex-\d\d\d\d-\d\d-\d\d-\d\d-\d\d-\d\d.pdf"/,
r.headers['Content-Disposition']
assert_equal '%PDF', r.body[0..3]
assert_equal "EOF\n", r.body[-4..-1]
end
# def test_export_pdf
# r = process 'export_pdf', 'web' => 'wiki1'
# assert_response(:success, bypass_body_parsing = true)
# assert_equal 'application/pdf', r.headers['Content-Type']
# assert_match /attachment; filename="wiki1-tex-\d\d\d\d-\d\d-\d\d-\d\d-\d\d-\d\d.pdf"/,
# r.headers['Content-Disposition']
# assert_equal '%PDF', r.body[0..3]
# assert_equal "EOF\n", r.body[-4..-1]
# end
else
puts 'Warning: tests involving pdflatex are very slow, therefore they are disabled by default.'
@ -175,15 +175,15 @@ class WikiControllerTest < Test::Unit::TestCase
puts ' $INSTIKI_TEST_PDFLATEX to enable them.'
end
def test_export_tex
r = process 'export_tex', 'web' => 'wiki1'
assert_response(:success, bypass_body_parsing = true)
assert_equal 'application/octet-stream', r.headers['Content-Type']
assert_match /attachment; filename="wiki1-tex-\d\d\d\d-\d\d-\d\d-\d\d-\d\d-\d\d.tex"/,
r.headers['Content-Disposition']
assert_equal '\documentclass', r.body[0..13], 'Content is not a TeX file'
end
# def test_export_tex
# r = process 'export_tex', 'web' => 'wiki1'
#
# assert_response(:success, bypass_body_parsing = true)
# assert_equal 'application/octet-stream', r.headers['Content-Type']
# assert_match /attachment; filename="wiki1-tex-\d\d\d\d-\d\d-\d\d-\d\d-\d\d-\d\d.tex"/,
# r.headers['Content-Disposition']
# assert_equal '\documentclass', r.body[0..13], 'Content is not a TeX file'
# end
def test_feeds
process('feeds', 'web' => 'wiki1')
@ -251,18 +251,18 @@ class WikiControllerTest < Test::Unit::TestCase
if ENV['INSTIKI_TEST_LATEX'] or defined? $INSTIKI_TEST_PDFLATEX
def test_pdf
assert RedClothForTex.available?, 'Cannot do test_pdf when pdflatex is not available'
r = process('pdf', 'web' => 'wiki1', 'id' => 'HomePage')
assert_response(:success, bypass_body_parsing = true)
assert_equal '%PDF', r.body[0..3]
assert_equal "EOF\n", r.body[-4..-1]
assert_equal 'application/pdf', r.headers['Content-Type']
assert_match /attachment; filename="HomePage-wiki1-\d\d\d\d-\d\d-\d\d-\d\d-\d\d-\d\d.pdf"/,
r.headers['Content-Disposition']
end
# def test_pdf
# assert RedClothForTex.available?, 'Cannot do test_pdf when pdflatex is not available'
# r = process('pdf', 'web' => 'wiki1', 'id' => 'HomePage')
# assert_response(:success, bypass_body_parsing = true)
#
# assert_equal '%PDF', r.body[0..3]
# assert_equal "EOF\n", r.body[-4..-1]
#
# assert_equal 'application/pdf', r.headers['Content-Type']
# assert_match /attachment; filename="HomePage-wiki1-\d\d\d\d-\d\d-\d\d-\d\d-\d\d-\d\d.pdf"/,
# r.headers['Content-Disposition']
# end
end
@ -387,8 +387,8 @@ class WikiControllerTest < Test::Unit::TestCase
assert_equal @home.revisions[0], r.template_objects['revision']
end
def test_rss_with_content
r = process 'rss_with_content', 'web' => 'wiki1'
def test_atom_with_content
r = process 'atom_with_content', 'web' => 'wiki1'
assert_response(:success)
pages = r.template_objects['pages_by_revision']
@ -397,24 +397,24 @@ class WikiControllerTest < Test::Unit::TestCase
assert !r.template_objects['hide_description']
end
def test_rss_with_content_when_blocked
def test_atom_with_content_when_blocked
@web.update_attributes(:password => 'aaa', :published => false)
@web = Web.find(@web.id)
r = process 'rss_with_content', 'web' => 'wiki1'
r = process 'atom_with_content', 'web' => 'wiki1'
assert_equal 403, r.response_code
end
def test_rss_with_headlines
def test_atom_with_headlines
@title_with_spaces = @wiki.write_page('wiki1', 'Title With Spaces',
'About spaces', 1.hour.ago, Author.new('TreeHugger', '127.0.0.2'), test_renderer)
@request.host = 'localhost'
@request.port = 8080
r = process 'rss_with_headlines', 'web' => 'wiki1'
r = process 'atom_with_headlines', 'web' => 'wiki1'
assert_response(:success)
pages = r.template_objects['pages_by_revision']
@ -435,20 +435,25 @@ class WikiControllerTest < Test::Unit::TestCase
'http://localhost:8080/wiki1/show/HomePage',
]
assert_template_xpath_match '/rss/channel/link',
'http://localhost:8080/wiki1/show/HomePage'
assert_template_xpath_match '/rss/channel/item/guid', expected_page_links
assert_template_xpath_match '/rss/channel/item/link', expected_page_links
assert_tag :tag => 'link',
:parent => {:tag => 'feed'},
:attributes => { :rel => 'alternate',
:href => 'http://localhost:8080/wiki1/show/HomePage'}
expected_page_links.each do |link|
assert_tag :tag => 'link',
:parent => {:tag => 'entry'},
:attributes => {:href => link }
end
end
def test_rss_switch_links_to_published
def test_atom_switch_links_to_published
@web.update_attributes(:password => 'aaa', :published => true)
@web = Web.find(@web.id)
@request.host = 'foo.bar.info'
@request.port = 80
r = process 'rss_with_headlines', 'web' => 'wiki1'
r = process 'atom_with_headlines', 'web' => 'wiki1'
assert_response(:success)
xml = REXML::Document.new(r.body)
@ -463,69 +468,76 @@ class WikiControllerTest < Test::Unit::TestCase
'http://foo.bar.info/wiki1/published/FirstPage',
'http://foo.bar.info/wiki1/published/HomePage']
assert_template_xpath_match '/rss/channel/link',
'http://foo.bar.info/wiki1/published/HomePage'
assert_template_xpath_match '/rss/channel/item/guid', expected_page_links
assert_template_xpath_match '/rss/channel/item/link', expected_page_links
assert_tag :tag => 'link',
:parent =>{:tag =>'feed'},
:attributes => {:rel => 'alternate',
:href => 'http://foo.bar.info/wiki1/published/HomePage'}
expected_page_links.each do |link|
assert_tag :tag => 'link',
:parent => {:tag => 'entry'},
:attributes => {:href => link}
end
end
def test_rss_with_params
setup_wiki_with_30_pages
# def test_atom_with_params
# setup_wiki_with_30_pages
#
# r = process 'atom_with_headlines', 'web' => 'wiki1'
# assert_response(:success)
# pages = r.template_objects['pages_by_revision']
# assert_equal 15, pages.size, 15
#
# r = process 'atom_with_headlines', 'web' => 'wiki1', 'limit' => '5'
# assert_response(:success)
# pages = r.template_objects['pages_by_revision']
# assert_equal 5, pages.size
#
# r = process 'atom_with_headlines', 'web' => 'wiki1', 'limit' => '25'
# assert_response(:success)
# pages = r.template_objects['pages_by_revision']
# assert_equal 25, pages.size
#
# r = process 'atom_with_headlines', 'web' => 'wiki1', 'limit' => 'all'
# assert_response(:success)
# pages = r.template_objects['pages_by_revision']
# assert_equal 38, pages.size
#
# r = process 'atom_with_headlines', 'web' => 'wiki1', 'start' => '1976-10-16'
# assert_response(:success)
# pages = r.template_objects['pages_by_revision']
# assert_equal 23, pages.size
#
# r = process 'atom_with_headlines', 'web' => 'wiki1', 'end' => '1976-10-16'
# assert_response(:success)
# pages = r.template_objects['pages_by_revision']
# assert_equal 15, pages.size
#
# r = process 'atom_with_headlines', 'web' => 'wiki1', 'start' => '1976-10-01', 'end' => '1976-10-06'
# assert_response(:success)
# pages = r.template_objects['pages_by_revision']
# assert_equal 5, pages.size
# end
r = process 'rss_with_headlines', 'web' => 'wiki1'
assert_response(:success)
pages = r.template_objects['pages_by_revision']
assert_equal 15, pages.size, 15
r = process 'rss_with_headlines', 'web' => 'wiki1', 'limit' => '5'
assert_response(:success)
pages = r.template_objects['pages_by_revision']
assert_equal 5, pages.size
r = process 'rss_with_headlines', 'web' => 'wiki1', 'limit' => '25'
assert_response(:success)
pages = r.template_objects['pages_by_revision']
assert_equal 25, pages.size
r = process 'rss_with_headlines', 'web' => 'wiki1', 'limit' => 'all'
assert_response(:success)
pages = r.template_objects['pages_by_revision']
assert_equal 38, pages.size
r = process 'rss_with_headlines', 'web' => 'wiki1', 'start' => '1976-10-16'
assert_response(:success)
pages = r.template_objects['pages_by_revision']
assert_equal 23, pages.size
r = process 'rss_with_headlines', 'web' => 'wiki1', 'end' => '1976-10-16'
assert_response(:success)
pages = r.template_objects['pages_by_revision']
assert_equal 15, pages.size
r = process 'rss_with_headlines', 'web' => 'wiki1', 'start' => '1976-10-01', 'end' => '1976-10-06'
assert_response(:success)
pages = r.template_objects['pages_by_revision']
assert_equal 5, pages.size
end
def test_rss_title_with_ampersand
def test_atom_title_with_ampersand
# was ticket:143
@wiki.write_page('wiki1', 'Title&With&Ampersands',
'About spaces', 1.hour.ago, Author.new('NitPicker', '127.0.0.3'), test_renderer)
r = process 'rss_with_headlines', 'web' => 'wiki1'
r = process 'atom_with_headlines', 'web' => 'wiki1'
assert r.body.include?('<title>Home Page</title>')
assert r.body.include?('<title>Title&amp;With&amp;Ampersands</title>')
assert r.body.include?('<title type="html">Home Page</title>')
assert r.body.include?('<title type="html">Title&amp;With&amp;Ampersands</title>')
end
def test_rss_timestamp
def test_atom_timestamp
new_page = @wiki.write_page('wiki1', 'PageCreatedAtTheBeginningOfCtime',
'Created on 1 Jan 1970 at 0:00:00 Z', Time.at(0), Author.new('NitPicker', '127.0.0.3'),
test_renderer)
r = process 'rss_with_headlines', 'web' => 'wiki1'
assert_template_xpath_match '/rss/channel/item/pubDate[9]', "Thu, 01 Jan 1970 00:00:00 Z"
r = process 'atom_with_headlines', 'web' => 'wiki1'
assert_tag :tag =>'published',
:parent => {:tag => 'entry'},
:content => '2004-04-04T21:50:00Z'
end
def test_save
@ -565,7 +577,7 @@ class WikiControllerTest < Test::Unit::TestCase
'author' => 'SomeOtherAuthor'}, {:return_to => '/wiki1/show/HomePage'}
assert_redirected_to :action => 'edit', :web => 'wiki1', :id => 'HomePage'
assert(@response.has_key(:error))
# assert(@response.has_key(:error))
assert r.flash[:error].kind_of?(Instiki::ValidationError)
revisions_after = @home.revisions.size
@ -653,14 +665,14 @@ class WikiControllerTest < Test::Unit::TestCase
r = process('tex', 'web' => 'wiki1', 'id' => 'HomePage')
assert_response(:success)
assert_equal "\\documentclass[12pt,titlepage]{article}\n\n\\usepackage[danish]{babel} " +
"%danske tekster\n\\usepackage[OT1]{fontenc} %rigtige danske bogstaver...\n" +
"\\usepackage{a4}\n\\usepackage{graphicx}\n\\usepackage{ucs}\n\\usepackage[utf8x]" +
"{inputenc}\n\\input epsf \n\n%----------------------------------------------------" +
"---------------\n\n\\begin{document}\n\n\\sloppy\n\n%-----------------------------" +
"--------------------------------------\n\n\\section*{HomePage}\n\nHisWay would be " +
"MyWay in kinda ThatWay in HisWay though MyWay \\OverThere -- see SmartEngine in that " +
"SmartEngineGUI\n\n\\end{document}", r.body
assert_equal "\\documentclass[12pt,titlepage]{article}\n\n\\usepackage{amsmath}" +
"\n\\usepackage{amsfonts}\n\\usepackage{graphicx}\n\\usepackage{ucs}\n" +
"\\usepackage[utf8x]{inputenc}\n\\usepackage{hyperref}\n\n" +
"%-------------------------------------------------------------------\n\n" +
"\\begin{document}\n\n%--------------------------------------------------" +
"-----------------\n\n\\section*{HomePage}\n\nTeX export only supported with" +
" the Markdown text filters.\n\n\\end{document}\n",
r.body
end

View file

@ -11,7 +11,9 @@ class DiffTest < Test::Unit::TestCase
def diff(a,b)
diff_doc = REXML::Document.new
diff_doc << (div = REXML::Element.new 'div' )
div = REXML::Element.new('div', nil, {:respect_whitespace =>:all})
div.attributes['class'] = 'xhtmldiff_wrapper'
diff_doc << div
hd = XHTMLDiff.new(div)
parsed_a = REXML::HashableElementDelegator.new(
REXML::XPath.first(REXML::Document.new("<div>"+a+"</div>"), '/div'))
@ -20,14 +22,14 @@ class DiffTest < Test::Unit::TestCase
Diff::LCS.traverse_balanced(parsed_a, parsed_b, hd)
diffs = ''
diff_doc.write(diffs, -1, true, true)
diffs
diffs.gsub(/\A<div class='xhtmldiff_wrapper'>(.*)<\/div>\Z/m, '\1')
end
def test_html_diff_simple
a = 'this was the original string'
b = 'this is the new string'
assert_equal("<div><span> this<del class='diffmod'> was</del><ins class='diffmod'> is</ins> the" +
"<del class='diffmod'> original</del><ins class='diffmod'> new</ins> string</span></div>",
assert_equal("<span> this<del class='diffmod'> was</del><ins class='diffmod'> is</ins> the" +
"<del class='diffmod'> original</del><ins class='diffmod'> new</ins> string</span>",
diff(a, b))
end
@ -35,10 +37,10 @@ class DiffTest < Test::Unit::TestCase
a = "<p>this was the original string</p>"
b = "<p>this is</p>\n<p> the new string</p>\n<p>around the world</p>"
assert_equal(
"<div><p><span> this<del class='diffmod'> was</del><ins class='diffmod'> is</ins>" +
"<p><span> this<del class='diffmod'> was</del><ins class='diffmod'> is</ins>" +
"<del class='diffdel'> the</del><del class='diffdel'> original</del><del class='diffdel'> string</del></span></p>" +
"<ins class='diffins'>\n</ins><ins class='diffins'><p> the new string</p></ins>" +
"<ins class='diffins'>\n</ins><ins class='diffins'><p>around the world</p></ins></div>",
"<ins class='diffins'>\n</ins><ins class='diffins'><p>around the world</p></ins>",
diff(a, b))
end
@ -46,8 +48,8 @@ class DiffTest < Test::Unit::TestCase
a = "<p>this is a paragraph</p>\n<p>this is a second paragraph</p>\n<p>this is a third paragraph</p>"
b = "<p>this is a paragraph</p>\n<p>this is a third paragraph</p>"
assert_equal(
"<div><p>this is a paragraph</p>\n<del class='diffdel'><p>this is a second paragraph</p></del>" +
"<del class='diffdel'>\n</del><p>this is a third paragraph</p></div>",
"<p>this is a paragraph</p>\n<del class='diffdel'><p>this is a second paragraph</p></del>" +
"<del class='diffdel'>\n</del><p>this is a third paragraph</p>",
diff(a, b))
end
@ -55,8 +57,8 @@ class DiffTest < Test::Unit::TestCase
a = "<p>foo bar</p>"
b = "<p>foo</p><p>bar</p>"
assert_equal(
"<div><p><span> foo<del class='diffdel'> bar</del></span></p>" +
"<ins class='diffins'><p>bar</p></ins></div>",
"<p><span> foo<del class='diffdel'> bar</del></span></p>" +
"<ins class='diffins'><p>bar</p></ins>",
diff(a,b))
end
@ -64,8 +66,8 @@ class DiffTest < Test::Unit::TestCase
a = "<p>foo</p><p>bar</p>"
b = "<p>foo bar</p>"
assert_equal(
"<div><p><span> foo<ins class='diffins'> bar</ins></span></p>" +
"<del class='diffdel'><p>bar</p></del></div>",
"<p><span> foo<ins class='diffins'> bar</ins></span></p>" +
"<del class='diffdel'><p>bar</p></del>",
diff(a,b))
end
@ -73,31 +75,31 @@ class DiffTest < Test::Unit::TestCase
a = "<p>foo bar</p>"
b = "<p>foo <b>bar</b></p>"
assert_equal(
"<div><p><span> foo<del class='diffdel'> bar</del></span>" +
"<ins class='diffins'><b>bar</b></ins></p></div>",
"<p><span> foo<del class='diffdel'> bar</del></span>" +
"<ins class='diffins'><b>bar</b></ins></p>",
diff(a,b))
end
def test_html_diff_with_tags
a = ""
b = "<div>foo</div>"
assert_equal "<ins class='diffins'><div>foo</div></ins>", diff(a, b)
end
# FIXME this test fails (ticket #67, http://dev.instiki.org/ticket/67)
def test_html_diff_preserves_endlines_in_pre
a = "<pre>a\nb\nc\n</pre>"
b = "<pre>a\n</pre>"
assert_equal(
"<div><pre><span> a\n<del class='diffdel'>b\nc\n</del></span></pre></div>",
"<pre><span> a\n<del class='diffdel'>b\nc\n</del></span></pre>",
diff(a, b))
end
def test_html_diff_with_tags
a = ""
b = "<div>foo</div>"
assert_equal "<div><ins class='diffins'><div>foo</div></ins></div>", diff(a, b)
end
# FIXME. xhtmldiff fails to detect any change here
def test_diff_for_tag_change
a = "<a>x</a>"
b = "<b>x</b>"
# FIXME. xhtmldiff fails to detect any change here
assert_equal "<div><del class='diffdel'><a>x</a></del><ins class='diffins'><b>x</b></ins></div>", diff(a, b)
assert_equal "<del class='diffdel'><a>x</a></del><ins class='diffins'><b>x</b></ins>", diff(a, b)
end
end

View file

@ -1,18 +1,17 @@
#!/usr/bin/env ruby
require File.dirname(__FILE__) + '/../test_helper'
require 'redcloth_for_tex'
class RedClothForTexTest < Test::Unit::TestCase
def test_basics
assert_equal '{\bf First Page}', RedClothForTex.new("*First Page*").to_tex
assert_equal '{\em First Page}', RedClothForTex.new("_First Page_").to_tex
assert_equal "\\begin{itemize}\n\t\\item A\n\t\t\\item B\n\t\t\\item C\n\t\\end{itemize}", RedClothForTex.new("* A\n* B\n* C").to_tex
assert_equal '{\bf First Page}', Maruku.new('*First Page*').to_latex
assert_equal '{\em First Page}', Maruku.new('_First Page_').to_latex
assert_equal "\\begin{itemize}\n\t\\item A\n\t\t\\item B\n\t\t\\item C\n\t\\end{itemize}", Maruku.new('* A\n* B\n* C').to_latex
end
def test_blocks
assert_equal '\section*{hello}', RedClothForTex.new("h1. hello").to_tex
assert_equal '\subsection*{hello}', RedClothForTex.new("h2. hello").to_tex
assert_equal '\section*{hello}', Maruku.new('#hello#').to_latex
assert_equal '\subsection*{hello}', Maruku.new('##hello##').to_latex
end
def test_table_of_contents

View file

@ -46,7 +46,7 @@ class PageRendererTest < Test::Unit::TestCase
'would be <a class="existingWikiWord" href="../show/MyWay">My Way</a> in kinda ' +
'<a class="existingWikiWord" href="../show/ThatWay">That Way</a> in ' +
'<span class="newWikiWord">His Way<a href="../show/HisWay">?</a></span> ' +
'though <a class="existingWikiWord" href="../show/MyWay">My Way</a> OverThere&#8212;see ' +
%{though <a class="existingWikiWord" href="../show/MyWay">My Way</a> OverThere—see } +
'<a class="existingWikiWord" href="../show/SmartEngine">Smart Engine</a> in that ' +
'<span class="newWikiWord">Smart Engine GUI' +
'<a href="../show/SmartEngineGUI">?</a></span></p>',
@ -57,10 +57,15 @@ class PageRendererTest < Test::Unit::TestCase
set_web_property :markup, :markdown
assert_markup_parsed_as(
%{<h1>My Headline</h1>\n\n<p>that <span class="newWikiWord">} +
%{<h1 id="my_headline">My Headline</h1>\n\n<p>that <span class="newWikiWord">} +
%{Smart Engine GUI<a href="../show/SmartEngineGUI">?</a></span></p>},
"My Headline\n===========\n\nthat SmartEngineGUI")
assert_markup_parsed_as(
%{<h1 id="my_headline">My Headline</h1>\n\n<p>that <span class="newWikiWord">} +
%{Smart Engine GUI<a href="../show/SmartEngineGUI">?</a></span></p>},
"#My Headline#\n\nthat SmartEngineGUI")
code_block = [
'This is a code block:',
'',
@ -72,7 +77,7 @@ class PageRendererTest < Test::Unit::TestCase
assert_markup_parsed_as(
%{<p>This is a code block:</p>\n\n<pre><code>def a_method(arg)\n} +
%{return ThatWay\n</code></pre>\n\n<p>Nice!</p>},
%{return ThatWay</code></pre>\n\n<p>Nice!</p>},
code_block)
end
@ -100,15 +105,15 @@ class PageRendererTest < Test::Unit::TestCase
set_web_property :markup, :markdown
assert_markup_parsed_as(
"<h1>Markdown heading</h1>\n\n" +
"<h1 id=\"markdown_heading\">Markdown heading</h1>\n\n" +
"<p>h2. Textile heading</p>\n\n" +
"<p><em>some</em> <strong>text</strong> <em>with</em> -styles-</p>\n\n" +
"<ul>\n<li>list 1</li>\n<li>list 2</li>\n</ul>",
"<ul>\n<li>list 1</li>\n\n<li>list 2</li>\n</ul>",
textile_and_markdown)
set_web_property :markup, :textile
assert_markup_parsed_as(
"<p>Markdown heading<br />================</p>\n\n\n\t<h2>Textile heading</h2>" +
"<p>Markdown heading<br/>================</p>\n\n\n\t<h2>Textile heading</h2>" +
"\n\n\n\t<p><strong>some</strong> <b>text</b> <em>with</em> <del>styles</del></p>" +
"\n\n\n\t<ul>\n\t<li>list 1</li>\n\t\t<li>list 2</li>\n\t</ul>",
textile_and_markdown)
@ -159,14 +164,14 @@ class PageRendererTest < Test::Unit::TestCase
# wikiwords are invalid as styles, must be in "name: value" form
def test_content_with_wikiword_in_style_tag
assert_markup_parsed_as(
'<p>That is some <em style="">Stylish Emphasis</em></p>',
"<p>That is some <em style=''>Stylish Emphasis</em></p>",
'That is some <em style="WikiWord">Stylish Emphasis</em>')
end
# validates format of style..
def test_content_with_valid_style_in_style_tag
assert_markup_parsed_as(
'<p>That is some <em style="text-align: right;">Stylish Emphasis</em></p>',
"<p>That is some <em style='text-align: right;'>Stylish Emphasis</em></p>",
'That is some <em style="text-align: right">Stylish Emphasis</em>')
end
@ -177,37 +182,37 @@ class PageRendererTest < Test::Unit::TestCase
def test_content_with_pre_blocks
assert_markup_parsed_as(
'<p>A <code>class SmartEngine end</code> would not mark up <pre>CodeBlocks</pre></p>',
'<p>A <code>class SmartEngine end</code> would not mark up </p><pre>CodeBlocks</pre>',
'A <code>class SmartEngine end</code> would not mark up <pre>CodeBlocks</pre>')
end
def test_content_with_autolink_in_parentheses
assert_markup_parsed_as(
'<p>The <span class="caps">W3C</span> body (<a href="http://www.w3c.org">' +
'<p>The <span class=\'caps\'>W3C</span> body (<a href="http://www.w3c.org">' +
'http://www.w3c.org</a>) sets web standards</p>',
'The W3C body (http://www.w3c.org) sets web standards')
end
def test_content_with_link_in_parentheses
assert_markup_parsed_as(
'<p>(<a href="http://wiki.org/wiki.cgi?WhatIsWiki">What is a wiki?</a>)</p>',
"<p>(<a href='http://wiki.org/wiki.cgi?WhatIsWiki'>What is a wiki?</a>)</p>",
'("What is a wiki?":http://wiki.org/wiki.cgi?WhatIsWiki)')
end
def test_content_with_image_link
assert_markup_parsed_as(
'<p>This <img src="http://hobix.com/sample.jpg" alt="" /> is a Textile image link.</p>',
"<p>This <img src='http://hobix.com/sample.jpg' alt=''/> is a Textile image link.</p>",
'This !http://hobix.com/sample.jpg! is a Textile image link.')
end
def test_content_with_inlined_img_tag
assert_markup_parsed_as(
'<p>This <img src="http://hobix.com/sample.jpg" alt="" /> is an inline image link.</p>',
"<p>This <img src='http://hobix.com/sample.jpg' alt=''/> is an inline image link.</p>",
'This <img src="http://hobix.com/sample.jpg" alt="" /> is an inline image link.')
# currently, upper case HTML elements are not allowed
assert_markup_parsed_as(
'<p>This &lt;IMG SRC="http://hobix.com/sample.jpg" alt=""> is an inline image link.</p>',
'<p>This &lt;IMG SRC="http://hobix.com/sample.jpg" alt=""&gt; is an inline image link.</p>',
'This <IMG SRC="http://hobix.com/sample.jpg" alt=""> is an inline image link.')
end
@ -239,7 +244,7 @@ class PageRendererTest < Test::Unit::TestCase
'<a class="existingWikiWord" href="MyWay.html">My Way</a> in kinda ' +
'<a class="existingWikiWord" href="ThatWay.html">That Way</a> in ' +
'<span class="newWikiWord">His Way</span> though ' +
'<a class="existingWikiWord" href="MyWay.html">My Way</a> OverThere&#8212;see ' +
%{<a class="existingWikiWord" href="MyWay.html">My Way</a> OverThere—see } +
'<a class="existingWikiWord" href="SmartEngine.html">Smart Engine</a> in that ' +
'<span class="newWikiWord">Smart Engine GUI</span></p>',
test_renderer(@revision).display_content_for_export
@ -254,7 +259,7 @@ class PageRendererTest < Test::Unit::TestCase
test_renderer(@revision).display_content
@revision.content = "f\r\nVersionHistory\r\n\r\ncry VersionHistory"
assert_equal "<p>f<br /><span class=\"newWikiWord\">Version History" +
assert_equal "<p>f<br/><span class=\"newWikiWord\">Version History" +
"<a href=\"../show/VersionHistory\">?</a></span></p>\n\n\n\t<p>cry " +
"<span class=\"newWikiWord\">Version History<a href=\"../show/VersionHistory\">?</a>" +
"</span></p>",
@ -274,8 +279,8 @@ class PageRendererTest < Test::Unit::TestCase
Revision.create(:page => @page, :content => 'What a red and lovely morning today',
:author => Author.new('DavidHeinemeierHansson'), :revised_at => Time.now)
assert_equal "<p>What a <del class=\"diffmod\">blue</del><ins class=\"diffmod\">red" +
"</ins> and lovely morning<ins class=\"diffins\"> today</ins></p>", test_renderer(@page.revisions.last).display_diff
assert_equal "<p><span> What a<del class='diffmod'> blue</del><ins class='diffmod'> red" +
"</ins> and lovely morning<ins class='diffins'> today</ins></span></p>", test_renderer(@page.revisions.last).display_diff
end
def test_link_to_file
@ -321,14 +326,14 @@ class PageRendererTest < Test::Unit::TestCase
EOL
assert_markup_parsed_as(
"<ul>\n\t<li><a href=\"~b\">a</a></li>\n\t\t<li>c~ d</li>\n\t</ul>",
"<ul>\n\t<li><a href='~b'>a</a></li>\n\t\t<li>c~ d</li>\n\t</ul>",
list_with_tildas)
end
def test_textile_image_in_mixed_wiki
set_web_property :markup, :mixed
assert_markup_parsed_as(
"<p><img src=\"http://google.com\" alt=\"\" />\nss</p>",
"<p><img src='http://google.com' alt=''/>\nss</p>",
"!http://google.com!\r\nss")
end
@ -395,4 +400,4 @@ class PageRendererTest < Test::Unit::TestCase
test_renderer(page.revisions.last).display_content
end
end
end

View file

@ -40,7 +40,7 @@ class WebTest < Test::Unit::TestCase
assert_equal '123', web.password
# new web should be set for maximum features enabled
assert_equal :textile, web.markup
assert_equal :markdownMML, web.markup
assert_equal '008B26', web.color
assert !web.safe_mode?
assert_equal([], web.pages)

vendor/plugins/HTML5lib/LICENSE (new vendored file, 17 lines)
View file

@ -0,0 +1,17 @@
Copyright (c) 2006-2007 The Authors
Contributors:
James Graham - jg307@cam.ac.uk
Anne van Kesteren - annevankesteren@gmail.com
Lachlan Hunt - lachlan.hunt@lachy.id.au
Matt McDonald - kanashii@kanashii.ca
Sam Ruby - rubys@intertwingly.net
Ian Hickson (Google) - ian@hixie.ch
Thomas Broyer - t.broyer@ltgt.net
Jacques Distler - distler@golem.ph.utexas.edu
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View file

@ -148,6 +148,38 @@ module HTML5lib
input
]
CDATA_ELEMENTS = %w[title textarea]
RCDATA_ELEMENTS = %w[
style
script
xmp
iframe
noembed
noframes
noscript
]
BOOLEAN_ATTRIBUTES = {
:global => %w[irrelevant],
'style' => %w[scoped],
'img' => %w[ismap],
'audio' => %w[autoplay controls],
'video' => %w[autoplay controls],
'script' => %w[defer async],
'details' => %w[open],
'datagrid' => %w[multiple disabled],
'command' => %w[hidden disabled checked default],
'menu' => %w[autosubmit],
'fieldset' => %w[disabled readonly],
'option' => %w[disabled readonly selected],
'optgroup' => %w[disabled readonly],
'button' => %w[disabled autofocus],
'input' => %w[disabled readonly required autofocus checked ismap],
'select' => %w[disabled readonly autofocus multiple],
'output' => %w[disabled readonly]
}
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN

View file

@ -0,0 +1 @@
require 'html5lib/filters/optionaltags'

View file

@ -0,0 +1,10 @@
require 'delegate'
require 'enumerator'
module HTML5lib
module Filters
class Base < SimpleDelegator
include Enumerable
end
end
end

View file

@ -0,0 +1,85 @@
require 'html5lib/filters/base'
module HTML5lib
module Filters
class InjectMetaCharset < Base
def initialize(source, encoding)
super(source)
@encoding = encoding
end
def each
state = :pre_head
meta_found = @encoding.nil?
pending = []
__getobj__.each do |token|
case token[:type]
when :StartTag
state = :in_head if token[:name].downcase == "head"
when :EmptyTag
if token[:name].downcase == "meta"
# replace charset with actual encoding
token[:data].each_with_index do |(name,value),index|
if name == 'charset'
token[:data][index][1]=@encoding
meta_found = true
end
end
# replace charset with actual encoding
has_http_equiv_content_type = false
content_index = -1
token[:data].each_with_index do |(name,value),i|
if name.downcase == 'charset'
token[:data][i] = ['charset', @encoding]
meta_found = true
break
elsif name == 'http-equiv' and value.downcase == 'content-type'
has_http_equiv_content_type = true
elsif name == 'content'
content_index = i
end
end
if not meta_found
if has_http_equiv_content_type and content_index >= 0
token[:data][content_index][1] =
'text/html; charset=%s' % @encoding
meta_found = true
end
end
elsif token[:name].downcase == "head" and not meta_found
# insert meta into empty head
yield(:type => :StartTag, :name => "head", :data => token[:data])
yield(:type => :EmptyTag, :name => "meta",
:data => [["charset", @encoding]])
yield(:type => :EndTag, :name => "head")
meta_found = true
next
end
when :EndTag
if token[:name].downcase == "head" and pending.any?
# insert meta into head (if necessary) and flush pending queue
yield pending.shift
yield(:type => :EmptyTag, :name => "meta",
:data => [["charset", @encoding]]) if not meta_found
yield pending.shift while pending.any?
meta_found = true
state = :post_head
end
end
if state == :in_head
pending << token
else
yield token
end
end
end
end
end
end
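Because Filters::Base, added earlier in this commit, is only a SimpleDelegator over anything that responds to each, the charset injector can be exercised without a full parse. A rough sketch, with hand-built token hashes that are not part of the commit (and assuming the plugin's lib directory is on the load path):

    require 'html5lib/filters/inject_meta_charset'

    tokens = [
      {:type => :StartTag, :name => 'head', :data => []},
      {:type => :EndTag,   :name => 'head'}
    ]
    # a <meta charset=utf-8> EmptyTag should be injected between the two
    # head tokens, since no meta was seen before the head closed
    HTML5lib::Filters::InjectMetaCharset.new(tokens, 'utf-8').to_a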

View file

@ -0,0 +1,199 @@
require 'html5lib/constants'
require 'html5lib/filters/base'
module HTML5lib
module Filters
class OptionalTagFilter < Base
def slider
previous1 = previous2 = nil
__getobj__.each do |token|
yield previous2, previous1, token if previous1 != nil
previous2 = previous1
previous1 = token
end
yield previous2, previous1, nil
end
def each
slider do |previous, token, nexttok|
type = token[:type]
if type == :StartTag
yield token unless token[:data].empty? and is_optional_start(token[:name], previous, nexttok)
elsif type == :EndTag
yield token unless is_optional_end(token[:name], nexttok)
else
yield token
end
end
end
def is_optional_start(tagname, previous, nexttok)
type = nexttok ? nexttok[:type] : nil
if tagname == 'html'
# An html element's start tag may be omitted if the first thing
# inside the html element is not a space character or a comment.
return ![:Comment, :SpaceCharacters].include?(type)
elsif tagname == 'head'
# A head element's start tag may be omitted if the first thing
# inside the head element is an element.
return type == :StartTag
elsif tagname == 'body'
# A body element's start tag may be omitted if the first thing
# inside the body element is not a space character or a comment,
# except if the first thing inside the body element is a script
# or style element and the node immediately preceding the body
# element is a head element whose end tag has been omitted.
if [:Comment, :SpaceCharacters].include?(type)
return false
elsif type == :StartTag
# XXX: we do not look at the preceding event, so we never omit
# the body element's start tag if it's followed by a script or
# a style element.
return !%w[script style].include?(nexttok[:name])
else
return true
end
elsif tagname == 'colgroup'
# A colgroup element's start tag may be omitted if the first thing
# inside the colgroup element is a col element, and if the element
# is not immediately preceded by another colgroup element whose
# end tag has been omitted.
if type == :StartTag
# XXX: we do not look at the preceding event, so instead we never
# omit the colgroup element's end tag when it is immediately
# followed by another colgroup element. See is_optional_end.
return nexttok[:name] == "col"
else
return false
end
elsif tagname == 'tbody'
# A tbody element's start tag may be omitted if the first thing
# inside the tbody element is a tr element, and if the element is
# not immediately preceded by a tbody, thead, or tfoot element
# whose end tag has been omitted.
if type == :StartTag
# omit the thead and tfoot elements' end tag when they are
# immediately followed by a tbody element. See is_optional_end.
if previous and previous[:type] == :EndTag and \
%w(tbody thead tfoot).include?(previous[:name])
return false
end
return nexttok[:name] == 'tr'
else
return false
end
end
return false
end
def is_optional_end(tagname, nexttok)
type = nexttok ? nexttok[:type] : nil
if %w[html head body].include?(tagname)
# An html element's end tag may be omitted if the html element
# is not immediately followed by a space character or a comment.
return ![:Comment, :SpaceCharacters].include?(type)
elsif %w[li optgroup option tr].include?(tagname)
# A li element's end tag may be omitted if the li element is
# immediately followed by another li element or if there is
# no more content in the parent element.
# An optgroup element's end tag may be omitted if the optgroup
# element is immediately followed by another optgroup element,
# or if there is no more content in the parent element.
# An option element's end tag may be omitted if the option
# element is immediately followed by another option element,
# or if there is no more content in the parent element.
# A tr element's end tag may be omitted if the tr element is
# immediately followed by another tr element, or if there is
# no more content in the parent element.
if type == :StartTag
return nexttok[:name] == tagname
else
return type == :EndTag || type == nil
end
elsif %w(dt dd).include?(tagname)
# A dt element's end tag may be omitted if the dt element is
# immediately followed by another dt element or a dd element.
# A dd element's end tag may be omitted if the dd element is
# immediately followed by another dd element or a dt element,
# or if there is no more content in the parent element.
if type == :StartTag
return %w(dt dd).include?(nexttok[:name])
elsif tagname == 'dd'
return type == :EndTag || type == nil
else
return false
end
elsif tagname == 'p'
# A p element's end tag may be omitted if the p element is
# immediately followed by an address, blockquote, dl, fieldset,
# form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
# or ul element, or if there is no more content in the parent
# element.
if type == :StartTag
return %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
h6 hr menu ol p pre table ul).include?(nexttok[:name])
else
return type == :EndTag || type == nil
end
elsif tagname == 'colgroup'
# A colgroup element's end tag may be omitted if the colgroup
# element is not immediately followed by a space character or
# a comment.
if [:Comment, :SpaceCharacters].include?(type)
return false
elsif type == :StartTag
# XXX: we also look for an immediately following colgroup
# element. See is_optional_start.
return nexttok[:name] != 'colgroup'
else
return true
end
elsif %w(thead tbody).include? tagname
# A thead element's end tag may be omitted if the thead element
# is immediately followed by a tbody or tfoot element.
# A tbody element's end tag may be omitted if the tbody element
# is immediately followed by a tbody or tfoot element, or if
# there is no more content in the parent element.
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == :StartTag
return %w(tbody tfoot).include?(nexttok[:name])
elsif tagname == 'tbody'
return (type == :EndTag or type == nil)
else
return false
end
elsif tagname == 'tfoot'
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == :StartTag
return nexttok[:name] == 'tbody'
else
return type == :EndTag || type == nil
end
elsif %w(td th).include? tagname
# A td element's end tag may be omitted if the td element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
# A th element's end tag may be omitted if the th element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
if type == :StartTag
return %w(td th).include?(nexttok[:name])
else
return type == :EndTag || type == nil
end
end
return false
end
end
end
end
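As a rough illustration of the omission rules above (token hashes invented here, not taken from the commit), the filter accepts any enumerable of token hashes, so a plain Array works:

    require 'html5lib/filters/optionaltags'

    tokens = [
      {:type => :StartTag, :name => 'html', :data => []},
      {:type => :StartTag, :name => 'body', :data => []},
      {:type => :Characters, :data => 'Hi'},
      {:type => :EndTag,   :name => 'body'},
      {:type => :EndTag,   :name => 'html'}
    ]
    HTML5lib::Filters::OptionalTagFilter.new(tokens).to_a
    # all four html/body tags are optional here, so only the
    # Characters token should come back out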

View file

@ -0,0 +1,15 @@
require 'html5lib/filters/base'
require 'html5lib/sanitizer'
module HTML5lib
module Filters
class HTMLSanitizeFilter < Base
include HTMLSanitizeModule
def each
__getobj__.each do |token|
yield(sanitize_token(token))
end
end
end
end
end

View file

@ -0,0 +1,36 @@
require 'html5lib/constants'
require 'html5lib/filters/base'
module HTML5lib
module Filters
class WhitespaceFilter < Base
SPACE_PRESERVE_ELEMENTS = %w[pre textarea] + RCDATA_ELEMENTS
SPACES = /[#{SPACE_CHARACTERS.join('')}]+/m
def each
preserve = 0
__getobj__.each do |token|
case token[:type]
when :StartTag
if preserve > 0 or SPACE_PRESERVE_ELEMENTS.include?(token[:name])
preserve += 1
end
when :EndTag
preserve -= 1 if preserve > 0
when :SpaceCharacters
next if preserve == 0
when :Characters
token[:data] = token[:data].sub(SPACES,' ') if preserve == 0
end
yield token
end
end
end
end
end
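A quick sketch of the effect, again with invented tokens: whitespace in character data is collapsed to a single space outside pre, textarea and the RCDATA elements (note the code uses sub, so only the first run per Characters token is touched):

    require 'html5lib/filters/whitespace'

    tokens = [
      {:type => :StartTag, :name => 'p', :data => []},
      {:type => :Characters, :data => "a \n\t b"},
      {:type => :EndTag,   :name => 'p'}
    ]
    # the middle token's data should come back as "a b"
    HTML5lib::Filters::WhitespaceFilter.new(tokens).to_a[1][:data]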

View file

@ -37,13 +37,13 @@ module HTML5lib
# :strict - raise an exception when a parse error is encountered
# :tree - a treebuilder class controlling the type of tree that will be
# returned. Built in treebuilders can be accessed through
# html5lib.treebuilders.getTreeBuilder(treeType)
# HTML5lib::TreeBuilders[treeType]
def initialize(options = {})
@strict = false
@errors = []
@tokenizer = HTMLTokenizer
@tree = TreeBuilders::REXMLTree::TreeBuilder
@tree = TreeBuilders::REXML::TreeBuilder
options.each { |name, value| instance_variable_set("@#{name}", value) }
@ -62,7 +62,8 @@ module HTML5lib
@errors = []
@tokenizer = @tokenizer.class unless Class === @tokenizer
@tokenizer = @tokenizer.new(stream, :encoding => encoding, :parseMeta => innerHTML)
@tokenizer = @tokenizer.new(stream, :encoding => encoding,
:parseMeta => !innerHTML)
if innerHTML
case @innerHTML = container.downcase
@ -99,10 +100,13 @@ module HTML5lib
case token[:type]
when :Characters, :SpaceCharacters, :Comment
@phase.send method, token[:data]
when :StartTag, :Doctype
when :StartTag
@phase.send method, token[:name], token[:data]
when :EndTag
@phase.send method, token[:name]
when :Doctype
@phase.send method, token[:name], token[:publicId],
token[:systemId], token[:correct]
else
parseError(token[:data])
end
@ -147,10 +151,6 @@ module HTML5lib
raise ParseError if @strict
end
# This error is not an error
def atheistParseError
end
# HTML5 specific normalizations to the token stream
def normalizeToken(token)
@ -160,9 +160,7 @@ module HTML5lib
# element. If it matches a void element atheists did the wrong
# thing and if it doesn't it's wrong for everyone.
if VOID_ELEMENTS.include?(token[:name])
atheistParseError
else
unless VOID_ELEMENTS.include?(token[:name])
parseError(_('Solidus (/) incorrectly placed in tag.'))
end
@ -181,7 +179,7 @@ module HTML5lib
end
elsif token[:type] == :EndTag
parseError(_('End tag contains unexpected attributes.')) if token[:data]
parseError(_('End tag contains unexpected attributes.')) unless token[:data].empty?
token[:name] = token[:name].downcase
end

View file

@ -5,7 +5,7 @@ module HTML5lib
handle_start 'html', 'head'
handle_end 'html'
handle_end %w( html head body br ) => 'ImplyHead'
def processEOF
startTagHead('head', {})
@ -28,7 +28,7 @@ module HTML5lib
@parser.phase.processStartTag(name, attributes)
end
def endTagHtml(name)
def endTagImplyHead(name)
startTagHead('head', {})
@parser.phase.processEndTag(name)
end
@ -38,4 +38,4 @@ module HTML5lib
end
end
end
end

View file

@ -5,15 +5,20 @@ module HTML5lib
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
handle_start 'html', 'body', 'form', 'plaintext', 'a', 'button', 'xmp', 'table', 'hr', 'image'
handle_start 'html'
handle_start %w( base link meta script style ) => 'ProcessInHead'
handle_start 'title'
handle_start 'input', 'textarea', 'select', 'isindex', %w( script style ), %w( marquee object )
handle_start 'body', 'form', 'plaintext', 'a', 'button', 'xmp', 'table', 'hr', 'image'
handle_start %w( li dd dt ) => 'ListItem', %w( base link meta title ) => 'FromHead'
handle_start 'input', 'textarea', 'select', 'isindex', %w( marquee object )
handle_start %w( li dd dt ) => 'ListItem'
handle_start %w( address blockquote center dir div dl fieldset listing menu ol p pre ul ) => 'CloseP'
handle_start %w( b big em font i nobr s small strike strong tt u ) => 'Formatting'
handle_start %w( b big em font i s small strike strong tt u ) => 'Formatting'
handle_start 'nobr'
handle_start %w( area basefont bgsound br embed img param spacer wbr ) => 'VoidFormatting'
@ -27,11 +32,15 @@ module HTML5lib
handle_end %w( address blockquote center div dl fieldset listing menu ol pre ul ) => 'Block'
handle_end HEADING_ELEMENTS => 'Heading'
handle_end %w( a b big em font i nobr s small strike strong tt u ) => 'Formatting'
handle_end %w( head frameset select optgroup option table caption colgroup col thead tfoot tbody tr td th ) => 'Misplaced'
handle_end %w( area basefont bgsound br embed hr image img input isindex param spacer wbr frame ) => 'None'
handle_end 'br'
handle_end %w( area basefont bgsound embed hr image img input isindex param spacer wbr frame ) => 'None'
handle_end %w( noframes noscript noembed textarea xmp iframe ) => 'CdataTextAreaXmp'
@ -41,14 +50,14 @@ module HTML5lib
super(parser, tree)
# for special handling of whitespace in <pre>
@processSpaceCharactersPre = false
@processSpaceCharactersDropNewline = false
end
def processSpaceCharactersPre(data)
def processSpaceCharactersDropNewline(data)
#Sometimes (start of <pre> blocks) we want to drop leading newlines
@processSpaceCharactersPre = false
@processSpaceCharactersDropNewline = false
if (data.length > 0 and data[0] == ?\n and
@tree.openElements[-1].name == 'pre' and
%w[pre textarea].include?(@tree.openElements[-1].name) and
not @tree.openElements[-1].hasContent)
data = data[1..-1]
end
@ -56,8 +65,8 @@ module HTML5lib
end
def processSpaceCharacters(data)
if @processSpaceCharactersPre
processSpaceCharactersPre(data)
if @processSpaceCharactersDropNewline
processSpaceCharactersDropNewline(data)
else
super(data)
end
@ -71,11 +80,11 @@ module HTML5lib
@tree.insertText(data)
end
def startTagScriptStyle(name, attributes)
def startTagProcessInHead(name, attributes)
@parser.phases[:inHead].processStartTag(name, attributes)
end
def startTagFromHead(name, attributes)
def startTagTitle(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) that belongs in the head. Moved."))
@parser.phases[:inHead].processStartTag(name, attributes)
end
@ -98,7 +107,7 @@ module HTML5lib
def startTagCloseP(name, attributes)
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@processSpaceCharactersPre = true if name == 'pre'
@processSpaceCharactersDropNewline = true if name == 'pre'
end
def startTagForm(name, attributes)
@ -118,7 +127,12 @@ module HTML5lib
@tree.openElements.reverse.each_with_index do |node, i|
if stopName.include?(node.name)
(i + 1).times { @tree.openElements.pop }
poppedNodes = (0..i).collect { @tree.openElements.pop }
if i >= 1
@parser.parseError("Missing end tag%s (%s)" % [
(i>1 ? 's' : ''),
poppedNodes.reverse.map {|item| item.name}.join(', ')])
end
break
end
@ -140,15 +154,19 @@ module HTML5lib
def startTagHeading(name, attributes)
endTagP('p') if in_scope?('p')
HEADING_ELEMENTS.each do |element|
if in_scope?(element)
@parser.parseError(_("Unexpected start tag (#{name})."))
remove_open_elements_until { |element| HEADING_ELEMENTS.include?(element.name) }
break
end
end
# Uncomment the following for IE7 behavior:
# HEADING_ELEMENTS.each do |element|
# if in_scope?(element)
# @parser.parseError(_("Unexpected start tag (#{name})."))
#
# remove_open_elements_until do |element|
# HEADING_ELEMENTS.include?(element.name)
# end
#
# break
# end
# end
@tree.insertElement(name, attributes)
end
@ -168,6 +186,12 @@ module HTML5lib
addFormattingElement(name, attributes)
end
def startTagNobr(name, attributes)
@tree.reconstructActiveFormattingElements
processEndTag('nobr') if in_scope?('nobr')
addFormattingElement(name, attributes)
end
def startTagButton(name, attributes)
if in_scope?('button')
@parser.parseError(_('Unexpected start tag (button) implied end tag (button).'))
@ -248,6 +272,7 @@ module HTML5lib
# XXX Form element pointer checking here as well...
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :RCDATA
@processSpaceCharactersDropNewline = true
end
# iframe, noembed noframes, noscript(if scripting enabled)
@ -312,7 +337,7 @@ module HTML5lib
def endTagBlock(name)
#Put us back in the right whitespace handling mode
@processSpaceCharactersPre = false if name == 'pre'
@processSpaceCharactersDropNewline = false if name == 'pre'
@tree.generateImpliedEndTags if in_scope?(name)
@ -494,6 +519,13 @@ module HTML5lib
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
def endTagBr(name)
@parser.parseError(_("Unexpected end tag (br). Treated as br element."))
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, {})
@tree.openElements.pop()
end
def endTagNone(name)
# This handles elements with no end tag.
@parser.parseError(_("This tag (#{name}) has no end tag"))
@ -545,4 +577,4 @@ module HTML5lib
end
end
end
end

View file

@ -5,7 +5,9 @@ module HTML5lib
handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta )
handle_end 'head', 'html', %w( title style script )
handle_end 'head'
handle_end %w( html body br ) => 'ImplyAfterHead'
handle_end %w( title style script )
def processEOF
if ['title', 'style', 'script'].include?(name = @tree.openElements[-1].name)
@ -63,7 +65,11 @@ module HTML5lib
def startTagBaseLinkMeta(name, attributes)
element = @tree.createElement(name, attributes)
appendToHead(element)
if @tree.headPointer != nil and @parser.phase == @parser.phases[:inHead]
appendToHead(element)
else
@tree.openElements[-1].appendChild(element)
end
end
def startTagOther(name, attributes)
@ -80,7 +86,7 @@ module HTML5lib
@parser.phase = @parser.phases[:afterHead]
end
def endTagHtml(name)
def endTagImplyAfterHead(name)
anythingElse
@parser.phase.processEndTag(name)
end
@ -117,4 +123,4 @@ module HTML5lib
end
end
end
end

View file

@ -89,10 +89,10 @@ module HTML5lib
def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}) in table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in
@parser.insertFromTable = true
@tree.insertFromTable = true
# Process the end tag in the "in body" mode
@parser.phases[:inBody].processEndTag(name)
@parser.insertFromTable = false
@tree.insertFromTable = false
end
protected
@ -107,4 +107,4 @@ module HTML5lib
end
end
end
end

View file

@ -17,9 +17,95 @@ module HTML5lib
@tree.insertComment(data, @tree.document)
end
def processDoctype(name, error)
@parser.parseError(_('Erroneous DOCTYPE.')) if error
def processDoctype(name, publicId, systemId, correct)
if name.downcase != 'html' or publicId or systemId
@parser.parseError(_('Erroneous DOCTYPE.'))
end
# XXX need to update DOCTYPE tokens
@tree.insertDoctype(name)
publicId = publicId.to_s.upcase
if name.downcase != 'html'
# XXX quirks mode
else
if ["+//silmaril//dtd html pro v0r11 19970101//en",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
"-//as//dtd html 3.0 aswedit + extensions//en",
"-//ietf//dtd html 2.0 level 1//en",
"-//ietf//dtd html 2.0 level 2//en",
"-//ietf//dtd html 2.0 strict level 1//en",
"-//ietf//dtd html 2.0 strict level 2//en",
"-//ietf//dtd html 2.0 strict//en",
"-//ietf//dtd html 2.0//en",
"-//ietf//dtd html 2.1e//en",
"-//ietf//dtd html 3.0//en",
"-//ietf//dtd html 3.0//en//",
"-//ietf//dtd html 3.2 final//en",
"-//ietf//dtd html 3.2//en",
"-//ietf//dtd html 3//en",
"-//ietf//dtd html level 0//en",
"-//ietf//dtd html level 0//en//2.0",
"-//ietf//dtd html level 1//en",
"-//ietf//dtd html level 1//en//2.0",
"-//ietf//dtd html level 2//en",
"-//ietf//dtd html level 2//en//2.0",
"-//ietf//dtd html level 3//en",
"-//ietf//dtd html level 3//en//3.0",
"-//ietf//dtd html strict level 0//en",
"-//ietf//dtd html strict level 0//en//2.0",
"-//ietf//dtd html strict level 1//en",
"-//ietf//dtd html strict level 1//en//2.0",
"-//ietf//dtd html strict level 2//en",
"-//ietf//dtd html strict level 2//en//2.0",
"-//ietf//dtd html strict level 3//en",
"-//ietf//dtd html strict level 3//en//3.0",
"-//ietf//dtd html strict//en",
"-//ietf//dtd html strict//en//2.0",
"-//ietf//dtd html strict//en//3.0",
"-//ietf//dtd html//en",
"-//ietf//dtd html//en//2.0",
"-//ietf//dtd html//en//3.0",
"-//metrius//dtd metrius presentational//en",
"-//microsoft//dtd internet explorer 2.0 html strict//en",
"-//microsoft//dtd internet explorer 2.0 html//en",
"-//microsoft//dtd internet explorer 2.0 tables//en",
"-//microsoft//dtd internet explorer 3.0 html strict//en",
"-//microsoft//dtd internet explorer 3.0 html//en",
"-//microsoft//dtd internet explorer 3.0 tables//en",
"-//netscape comm. corp.//dtd html//en",
"-//netscape comm. corp.//dtd strict html//en",
"-//o'reilly and associates//dtd html 2.0//en",
"-//o'reilly and associates//dtd html extended 1.0//en",
"-//spyglass//dtd html 2.0 extended//en",
"-//sq//dtd html 2.0 hotmetal + extensions//en",
"-//sun microsystems corp.//dtd hotjava html//en",
"-//sun microsystems corp.//dtd hotjava strict html//en",
"-//w3c//dtd html 3 1995-03-24//en",
"-//w3c//dtd html 3.2 draft//en",
"-//w3c//dtd html 3.2 final//en",
"-//w3c//dtd html 3.2//en",
"-//w3c//dtd html 3.2s draft//en",
"-//w3c//dtd html 4.0 frameset//en",
"-//w3c//dtd html 4.0 transitional//en",
"-//w3c//dtd html experimental 19960712//en",
"-//w3c//dtd html experimental 970421//en",
"-//w3c//dtd w3 html//en",
"-//w3o//dtd w3 html 3.0//en",
"-//w3o//dtd w3 html 3.0//en//",
"-//w3o//dtd w3 html strict 3.0//en//",
"-//webtechs//dtd mozilla html 2.0//en",
"-//webtechs//dtd mozilla html//en",
"-/w3c/dtd html 4.0 transitional/en",
"html"].include?(publicId) or
(systemId == nil and
["-//w3c//dtd html 4.01 frameset//EN",
"-//w3c//dtd html 4.01 transitional//EN"].include?(publicId)) or
(systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")
#XXX quirks mode
end
end
@parser.phase = @parser.phases[:rootElement]
end
@ -46,4 +132,4 @@ module HTML5lib
end
end
end
end

View file

@ -101,7 +101,7 @@ module HTML5lib
@tree.insertComment(data, @tree.openElements[-1])
end
def processDoctype(name, error)
def processDoctype(name, publicId, systemId, correct)
@parser.parseError(_('Unexpected DOCTYPE. Ignored.'))
end
@ -153,4 +153,4 @@ module HTML5lib
end
end
end
end

View file

@ -33,9 +33,6 @@ module HTML5lib
options.each { |name, value| instance_variable_set("@#{name}", value) }
# List of where new lines occur
@new_lines = []
# Raw Stream
@raw_stream = open_stream(source)
@ -55,25 +52,30 @@ module HTML5lib
# Read bytes from stream decoding them into Unicode
uString = @raw_stream.read
unless @char_encoding == 'utf-8'
if @char_encoding == 'windows-1252'
@win1252 = true
elsif @char_encoding != 'utf-8'
begin
require 'iconv'
uString = Iconv.iconv('utf-8', @encoding, uString)[0]
rescue
begin
uString = Iconv.iconv('utf-8', @char_encoding, uString).first
rescue
@win1252 = true
end
rescue LoadError
@win1252 = true
end
end
# Normalize newlines and null characters
uString.gsub!(/\r\n?/, "\n")
uString.gsub!("\x00", [0xFFFD].pack('U'))
# Convert the unicode string into a list to be used as the data stream
@data_stream = uString
@queue = []
# Reset position in the list to read from
reset
@tell = 0
@line = @col = 0
@line_lengths = []
end
# Produces a file object from source.
@ -95,11 +97,13 @@ module HTML5lib
#First look for a BOM
#This will also read past the BOM if present
encoding = detect_bom
#If there is no BOM need to look for meta elements with encoding
#information
if encoding.nil? and @parse_meta
encoding = detect_encoding_meta
end
#Guess with chardet, if available
if encoding.nil? and @chardet
begin
@ -107,17 +111,18 @@ module HTML5lib
require 'UniversalDetector' # gem install chardet
buffer = @raw_stream.read
encoding = UniversalDetector::chardet(buffer)['encoding']
@raw_stream = open_stream(buffer)
seek(buffer, 0)
rescue LoadError
end
end
# If all else fails use the default encoding
if encoding.nil?
encoding = @DEFAULT_ENCODING
end
#Substitute for equivalent encodings:
encoding_sub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}
#Substitute for equivalent encodings
encoding_sub = {'iso-8859-1' => 'windows-1252'}
if encoding_sub.has_key?(encoding.downcase)
encoding = encoding_sub[encoding.downcase]
@ -132,14 +137,13 @@ module HTML5lib
def detect_bom
bom_dict = {
"\xef\xbb\xbf" => 'utf-8',
"\xff\xfe" => 'utf-16-le',
"\xfe\xff" => 'utf-16-be',
"\xff\xfe\x00\x00" => 'utf-32-le',
"\x00\x00\xfe\xff" => 'utf-32-be'
"\xff\xfe" => 'utf-16le',
"\xfe\xff" => 'utf-16be',
"\xff\xfe\x00\x00" => 'utf-32le',
"\x00\x00\xfe\xff" => 'utf-32be'
}
# Go to beginning of file and read in 4 bytes
@raw_stream.seek(0)
string = @raw_stream.read(4)
return nil unless string
@ -156,45 +160,80 @@ module HTML5lib
end
end
#AT - move this to the caller?
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
@raw_stream.seek(encoding ? seek : 0)
seek(string, encoding ? seek : 0)
return encoding
end
# Report the encoding declared by the meta element
def detect_encoding_meta
parser = EncodingParser.new(@raw_stream.read(@NUM_BYTES_META))
@raw_stream.seek(0)
return parser.get_encoding
def seek(buffer, n)
if @raw_stream.respond_to?(:unget)
@raw_stream.unget(buffer[n..-1])
return
end
if @raw_stream.respond_to?(:seek)
begin
@raw_stream.seek(n)
return
rescue Errno::ESPIPE
end
end
require 'delegate'
@raw_stream = SimpleDelegator.new(@raw_stream)
class << @raw_stream
def read(chars=-1)
if chars == -1 or chars > @data.length
result = @data
@data = ''
return result if __getobj__.eof?
return result + __getobj__.read if chars == -1
return result + __getobj__.read(chars-result.length)
elsif @data.empty?
return __getobj__.read(chars)
else
result = @data[0...chars]
@data = @data[chars..-1]
return result
end
end
def unget(data)
if !@data or @data.empty?
@data = data
else
@data += data
end
end
end
@raw_stream.unget(buffer[n .. -1])
end
def determine_new_lines
# Looks through the stream to find where new lines occur so
# the position method can tell where it is.
@new_lines.push(0)
(0...@data_stream.length).each { |i| @new_lines.push(i) if @data_stream[i] == ?\n }
# Report the encoding declared by the meta element
def detect_encoding_meta
buffer = @raw_stream.read(@NUM_BYTES_META)
parser = EncodingParser.new(buffer)
seek(buffer, 0)
return parser.get_encoding
end
# Returns (line, col) of the current position in the stream.
def position
# Generate list of new lines first time around
determine_new_lines if @new_lines.empty?
line = 0
tell = @tell
@new_lines.each do |pos|
break unless pos < tell
line += 1
line, col = @line, @col
@queue.reverse.each do |c|
if c == "\n"
line -= 1
raise RuntimeError.new("col=#{col}") unless col == 0
col = @line_lengths[line]
else
col -= 1
end
end
col = tell - @new_lines[line-1] - 1
return [line, col]
end
# Resets the position in the stream back to the start.
def reset
@tell = 0
return [line+1, col]
end
# Read one character from the stream or queue if available. Return
@ -203,11 +242,60 @@ module HTML5lib
unless @queue.empty?
return @queue.shift
else
begin
@tell += 1
return @data_stream[@tell - 1].chr
rescue
return :EOF
c = @data_stream[@tell]
@tell += 1
case c
when 0x01 .. 0x7F
if c == 0x0D
# normalize newlines
@tell += 1 if @data_stream[@tell] == 0x0A
c = 0x0A
end
# update position in stream
if c == 0x0a
@line_lengths << @col
@line += 1
@col = 0
else
@col += 1
end
c.chr
when 0x80 .. 0xBF
if !@win1252
[0xFFFD].pack('U') # invalid utf-8
elsif c <= 0x9f
[ENTITIES_WINDOWS1252[c-0x80]].pack('U')
else
"\xC2" + c.chr # convert to utf-8
end
when 0xC0 .. 0xFF
if @win1252
"\xC3" + (c-64).chr # convert to utf-8
elsif @data_stream[@tell-1 .. -1] =~ /^
( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)/x
@tell += $1.length - 1
$1
else
[0xFFFD].pack('U') # invalid utf-8
end
when 0x00
[0xFFFD].pack('U') # null characters are invalid
else
:EOF
end
end
end
@ -218,28 +306,15 @@ module HTML5lib
def chars_until(characters, opposite=false)
char_stack = [char]
unless char_stack[0] == :EOF
while (characters.include? char_stack[-1]) == opposite
unless @queue.empty?
# First from the queue
char_stack.push(@queue.shift)
break if char_stack[-1] == :EOF
else
# Then the rest
begin
char_stack.push(@data_stream[@tell].chr)
@tell += 1
rescue
char_stack.push(:EOF)
break
end
end
end
while char_stack.last != :EOF
break unless (characters.include?(char_stack.last)) == opposite
char_stack.push(char)
end
# Put the character stopped on back to the front of the queue
# from where it came.
@queue.insert(0, char_stack.pop)
c = char_stack.pop
@queue.insert(0, c) unless c == :EOF
return char_stack.join('')
end
end
@ -428,7 +503,7 @@ module HTML5lib
space_found = false
#Step 5 attribute name
while true
if @data.current_byte == '=' and attr_name:
if @data.current_byte == '=' and attr_name
break
elsif SPACE_CHARACTERS.include?(@data.current_byte)
space_found = true

View file

@ -69,15 +69,22 @@ module HTML5lib
# ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted
if token[:type] == :EndTag and \
not VOID_ELEMENTS.include? token[:name] and \
token[:name] == @tree.openElements[-1].name and \
not @tree.openElements[-1].hasContent
@tree.insertText('') unless
@tree.openElements.any? {|e|
e.attributes.keys.include? 'xmlns' and
e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
}
if token[:type] == :EndTag
if VOID_ELEMENTS.include? token[:name]
if @tree.openElements[-1].name != token[:name]
token[:type] = :EmptyTag
token[:data] ||= {}
end
else
if token[:name] == @tree.openElements[-1].name and \
not @tree.openElements[-1].hasContent
@tree.insertText('') unless
@tree.openElements.any? {|e|
e.attributes.keys.include? 'xmlns' and
e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
}
end
end
end
return token

View file

@ -1,12 +1,22 @@
require 'html5lib/tokenizer'
require 'cgi'
module HTML5lib
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes.
#
# It can be either at the Tokenizer stage:
#
# HTMLParser.parse(html, :tokenizer => HTMLSanitizer)
#
# or, if you already have a parse tree (in this example, a REXML tree),
# at the Serializer stage:
#
# tokens = TreeWalkers.getTreeWalker('rexml').new(tree)
# HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
# :sanitize => true})
class HTMLSanitizer < HTMLTokenizer
module HTMLSanitizeModule
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
button caption center cite code col colgroup dd del dfn dir div dl dt
@ -64,7 +74,7 @@ module HTML5lib
xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
xmlns:xlink y y1 y2 zoomAndPan]
ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href]
ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href xml:base]
ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color
border-bottom-color border-collapse border-color border-left-color
@ -96,19 +106,7 @@ module HTML5lib
ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
# attributes are parsed, and a restricted set, # specified by
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
# in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def each
super do |token|
def sanitize_token(token)
case token[:type]
when :StartTag, :EndTag, :EmptyTag
if ALLOWED_ELEMENTS.include?(token[:name])
@ -116,7 +114,7 @@ module HTML5lib
attrs = Hash[*token[:data].flatten]
attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
ATTR_VAL_IS_URI.each do |attr|
val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
attrs.delete attr
end
@ -126,7 +124,7 @@ module HTML5lib
end
token[:data] = attrs.map {|k,v| [k,v]}
end
yield token
return token
else
if token[:type] == :EndTag
token[:data] = "</#{token[:name]}>"
@ -139,12 +137,14 @@ module HTML5lib
token[:data].insert(-2,'/') if token[:type] == :EmptyTag
token[:type] = :Characters
token.delete(:name)
yield token
return token
end
when :Comment
token[:data] = ""
return token
else
yield token
return token
end
end
end
def sanitize_css(style)
@ -174,4 +174,14 @@ module HTML5lib
style = clean.join(' ')
end
end
class HTMLSanitizer < HTMLTokenizer
include HTMLSanitizeModule
def each
super do |token|
yield(sanitize_token(token))
end
end
end
end
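Tying the two entry points from the comment at the top of this file together, a tokenizer-stage call looks roughly like this (the sample markup is invented; 'html5lib/html5parser' is assumed to be the parser's require path):

    require 'html5lib/html5parser'
    require 'html5lib/sanitizer'

    html = '<p onclick="evil()">hi <script>bad()</script></p>'
    # the onclick attribute is dropped and the script element is escaped
    # before any tree is built
    doc = HTML5lib::HTMLParser.parse(html, :tokenizer => HTML5lib::HTMLSanitizer)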

View file

@ -0,0 +1,2 @@
require 'html5lib/serializer/htmlserializer'
require 'html5lib/serializer/xhtmlserializer'

View file

@ -0,0 +1,177 @@
require 'html5lib/constants'
module HTML5lib
class HTMLSerializer
def self.serialize(stream, options = {})
new(options).serialize(stream, options[:encoding])
end
def escape(string)
string.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
end
def initialize(options={})
@quote_attr_values = false
@quote_char = '"'
@use_best_quote_char = true
@minimize_boolean_attributes = true
@use_trailing_solidus = false
@space_before_trailing_solidus = true
@escape_lt_in_attrs = false
@omit_optional_tags = true
@sanitize = false
@strip_whitespace = false
@inject_meta_charset = true
options.each do |name, value|
next unless instance_variables.include?("@#{name}")
@use_best_quote_char = false if name.to_s == 'quote_char'
instance_variable_set("@#{name}", value)
end
@errors = []
end
def serialize(treewalker, encoding=nil)
in_cdata = false
@errors = []
if encoding and @inject_meta_charset
require 'html5lib/filters/inject_meta_charset'
treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
end
if @strip_whitespace
require 'html5lib/filters/whitespace'
treewalker = Filters::WhitespaceFilter.new(treewalker)
end
if @sanitize
require 'html5lib/filters/sanitizer'
treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
end
if @omit_optional_tags
require 'html5lib/filters/optionaltags'
treewalker = Filters::OptionalTagFilter.new(treewalker)
end
result = []
treewalker.each do |token|
type = token[:type]
if type == :Doctype
doctype = "<!DOCTYPE %s>" % token[:name]
result << doctype
elsif [:Characters, :SpaceCharacters].include? type
if type == :SpaceCharacters or in_cdata
if in_cdata and token[:data].include?("</")
serializeError(_("Unexpected </ in CDATA"))
end
result << token[:data]
else
result << escape(token[:data])
end
elsif [:StartTag, :EmptyTag].include? type
name = token[:name]
if RCDATA_ELEMENTS.include?(name)
in_cdata = true
elsif in_cdata
serializeError(_("Unexpected child element of a CDATA element"))
end
attributes = []
for k,v in attrs = token[:data].to_a.sort
attributes << ' '
attributes << k
if not @minimize_boolean_attributes or \
(!(BOOLEAN_ATTRIBUTES[name]||[]).include?(k) \
and !BOOLEAN_ATTRIBUTES[:global].include?(k))
attributes << "="
if @quote_attr_values or v.empty?
quote_attr = true
else
quote_attr = (SPACE_CHARACTERS + %w(< > " ')).any? {|c| v.include?(c)}
end
v = v.gsub("&", "&amp;")
v = v.gsub("<", "&lt;") if @escape_lt_in_attrs
if quote_attr
quote_char = @quote_char
if @use_best_quote_char
if v.index("'") and !v.index('"')
quote_char = '"'
elsif v.index('"') and !v.index("'")
quote_char = "'"
end
end
if quote_char == "'"
v = v.gsub("'", "&#39;")
else
v = v.gsub('"', "&quot;")
end
attributes << quote_char << v << quote_char
else
attributes << v
end
end
end
if VOID_ELEMENTS.include?(name) and @use_trailing_solidus
if @space_before_trailing_solidus
attributes << " /"
else
attributes << "/"
end
end
result << "<%s%s>" % [name, attributes.join('')]
elsif type == :EndTag
name = token[:name]
if RCDATA_ELEMENTS.include?(name)
in_cdata = false
elsif in_cdata
serializeError(_("Unexpected child element of a CDATA element"))
end
end_tag = "</#{name}>"
result << end_tag
elsif type == :Comment
data = token[:data]
serializeError(_("Comment contains --")) if data.index("--")
comment = "<!--%s-->" % token[:data]
result << comment
else
serializeError(token[:data])
end
end
if encoding and encoding != 'utf-8'
require 'iconv'
Iconv.iconv(encoding, 'utf-8', result.join('')).first
else
result.join('')
end
end
alias :render :serialize
def serializeError(data="XXX ERROR MESSAGE NEEDED")
# XXX The idea is to make data mandatory.
@errors.push(data)
if @strict
raise SerializeError
end
end
end
# Error in serialized tree
class SerializeError < Exception
end
end
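A small usage sketch with invented token hashes: the serializer only iterates over something that yields tokens, so an Array can stand in for a tree walker, and the defaults above (safe attribute values left unquoted, optional tags omitted) show up directly in the output:

    require 'html5lib/serializer/htmlserializer'

    tokens = [
      {:type => :StartTag, :name => 'p', :data => [['class', 'note']]},
      {:type => :Characters, :data => 'a < b & c'},
      {:type => :EndTag,   :name => 'p'}
    ]
    HTML5lib::HTMLSerializer.serialize(tokens)
    # => "<p class=note>a &lt; b &amp; c"   (the optional </p> is dropped)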

View file

@ -0,0 +1,19 @@
require 'html5lib/serializer/htmlserializer'
module HTML5lib
class XHTMLSerializer < HTMLSerializer
DEFAULTS = {
:quote_attr_values => true,
:minimize_boolean_attributes => false,
:use_trailing_solidus => true,
:escape_lt_in_attrs => true,
:omit_optional_tags => false
}
def initialize(options={})
super(DEFAULTS.clone.update(options))
end
end
end
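With these defaults the same invented token stream as in the HTML serializer sketch comes out quoted and explicitly closed:

    require 'html5lib/serializer/xhtmlserializer'

    tokens = [
      {:type => :StartTag, :name => 'p', :data => [['class', 'note']]},
      {:type => :Characters, :data => 'a < b & c'},
      {:type => :EndTag,   :name => 'p'}
    ]
    HTML5lib::XHTMLSerializer.serialize(tokens)
    # => "<p class=\"note\">a &lt; b &amp; c</p>"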

View file

@ -41,19 +41,31 @@ module HTML5lib
:attributeValueUnQuoted => :attributeValueUnQuotedState,
:bogusComment => :bogusCommentState,
:markupDeclarationOpen => :markupDeclarationOpenState,
:commentStart => :commentStartState,
:commentStartDash => :commentStartDashState,
:comment => :commentState,
:commentDash => :commentDashState,
:commentEndDash => :commentEndDashState,
:commentEnd => :commentEndState,
:doctype => :doctypeState,
:beforeDoctypeName => :beforeDoctypeNameState,
:doctypeName => :doctypeNameState,
:afterDoctypeName => :afterDoctypeNameState,
:beforeDoctypePublicIdentifier => :beforeDoctypePublicIdentifierState,
:doctypePublicIdentifierDoubleQuoted => :doctypePublicIdentifierDoubleQuotedState,
:doctypePublicIdentifierSingleQuoted => :doctypePublicIdentifierSingleQuotedState,
:afterDoctypePublicIdentifier => :afterDoctypePublicIdentifierState,
:beforeDoctypeSystemIdentifier => :beforeDoctypeSystemIdentifierState,
:doctypeSystemIdentifierDoubleQuoted => :doctypeSystemIdentifierDoubleQuotedState,
:doctypeSystemIdentifierSingleQuoted => :doctypeSystemIdentifierSingleQuotedState,
:afterDoctypeSystemIdentifier => :afterDoctypeSystemIdentifierState,
:bogusDoctype => :bogusDoctypeState
}
# Setup the initial tokenizer state
@contentModelFlag = :PCDATA
@state = @states[:data]
@escapeFlag = false
@lastFourChars = []
# The current token being created
@currentToken = nil
@ -68,7 +80,6 @@ module HTML5lib
# to return we yield the token which pauses processing until the next token
# is requested.
def each
@stream.reset
@tokenQueue = []
# Start processing. When EOF is reached @state will return false
# instead of true and the loop will terminate.
@ -134,24 +145,14 @@ module HTML5lib
# If the integer is between 127 and 160 (so 128 and bigger and 159 and
# smaller) we need to do the "windows trick".
if (127...160).include? charAsInt
#XXX - removed parse error from windows 1252 entity for now
#we may want to reenable this later
#@tokenQueue.push({:type => :ParseError, :data =>
# _("Entity used with illegal number (windows-1252 reference).")})
@tokenQueue.push({:type => :ParseError, :data =>
_("Entity used with illegal number (windows-1252 reference).")})
charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
end
# 0 is not a good number.
if charAsInt == 0
charAsInt = 65533
end
if charAsInt <= 0x10FFFF
if charAsInt > 0 and charAsInt <= 1114111
char = [charAsInt].pack('U')
else
@tokenQueue.push({:type => :ParseError, :data =>
_("Numeric entity couldn't be converted to character.")})
end
# Discard the ; if present. Otherwise, put it back on the queue and
@ -168,7 +169,10 @@ module HTML5lib
def consumeEntity
char = nil
charStack = [@stream.char]
if charStack[0] == "#"
if SPACE_CHARACTERS.include?(charStack[0]) or
[:EOF, '<', '&'].include?(charStack[0])
@stream.queue+= charStack
elsif charStack[0] == "#"
# We might have a number entity here.
charStack += [@stream.char, @stream.char]
if charStack.include? :EOF
@ -195,10 +199,6 @@ module HTML5lib
_("Numeric entity expected but none found.")})
end
end
# Break out if we reach the end of the file
elsif charStack[0] == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Entity expected. Got end of file instead.")})
else
# At this point in the process might have named entity. Entities
# are stored in the global variable "entities".
@ -268,14 +268,33 @@ module HTML5lib
# statements should be.
def dataState
data = @stream.char
if data == "&" and (@contentModelFlag == :PCDATA or
@contentModelFlag == :RCDATA)
if @contentModelFlag == :CDATA or @contentModelFlag == :RCDATA
@lastFourChars << data
@lastFourChars.shift if @lastFourChars.length > 4
end
if data == "&" and [:PCDATA,:RCDATA].include?(@contentModelFlag)
@state = @states[:entityData]
elsif data == "<" and @contentModelFlag != :PLAINTEXT
@state = @states[:tagOpen]
elsif data == "-" and [:CDATA,:RCDATA].include?(@contentModelFlag) and
@escapeFlag == false and @lastFourChars.join('') == "<!--"
@escapeFlag = true
@tokenQueue.push({:type => :Characters, :data => data})
elsif data == "<" and @escapeFlag == false and
[:PCDATA,:CDATA,:RCDATA].include?(@contentModelFlag)
@state = @states[:tagOpen]
elsif data == ">" and [:CDATA,:RCDATA].include?(@contentModelFlag) and
@escapeFlag == true and @lastFourChars[1..-1].join('') == "-->"
@escapeFlag = false
@tokenQueue.push({:type => :Characters, :data => data})
elsif data == :EOF
# Tokenization ends.
return false
elsif SPACE_CHARACTERS.include? data
# Directly after emitting a token you switch back to the "data
# state". At that point SPACE_CHARACTERS are important so they are
@ -286,7 +305,7 @@ module HTML5lib
data + @stream.chars_until(SPACE_CHARACTERS, true)})
else
@tokenQueue.push({:type => :Characters, :data =>
data + @stream.chars_until(["&", "<"])})
data + @stream.chars_until(%w[& < > -])})
end
return true
end
@ -381,8 +400,6 @@ module HTML5lib
# emitting the end tag token.
@contentModelFlag = :PCDATA
else
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected closing tag after seeing '</'. None found.")})
@tokenQueue.push({:type => :Characters, :data => "</"})
@state = @states[:data]
@ -392,29 +409,27 @@ module HTML5lib
end
end
if @contentModelFlag == :PCDATA
data = @stream.char
if data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected closing tag. Unexpected end of file.")})
@tokenQueue.push({:type => :Characters, :data => "</"})
@state = @states[:data]
elsif ASCII_LETTERS.include? data
@currentToken =\
{:type => :EndTag, :name => data, :data => []}
@state = @states[:tagName]
elsif data == ">"
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
@state = @states[:data]
else
# XXX data can be _'_...
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected closing tag. Unexpected character '" + data + "' found.")})
@stream.queue.push(data)
@state = @states[:bogusComment]
end
data = @stream.char
if data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected closing tag. Unexpected end of file.")})
@tokenQueue.push({:type => :Characters, :data => "</"})
@state = @states[:data]
elsif ASCII_LETTERS.include? data
@currentToken = {:type => :EndTag, :name => data, :data => []}
@state = @states[:tagName]
elsif data == ">"
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
@state = @states[:data]
else
# XXX data can be _'_...
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected closing tag. Unexpected character '#{data}' found.")})
@stream.queue.push(data)
@state = @states[:bogusComment]
end
return true
end
@ -431,11 +446,6 @@ module HTML5lib
@stream.chars_until(ASCII_LETTERS, true)
elsif data == ">"
emitCurrentToken
elsif data == "<"
@stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected < character when getting the tag name.")})
emitCurrentToken
elsif data == "/"
processSolidusInTag
@state = @states[:beforeAttributeName]
@ -460,11 +470,6 @@ module HTML5lib
emitCurrentToken
elsif data == "/"
processSolidusInTag
elsif data == "<"
@stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected < character. Expected attribute name instead.")})
emitCurrentToken
else
@currentToken[:data].push([data, ""])
@state = @states[:attributeName]
@ -495,12 +500,6 @@ module HTML5lib
elsif data == "/"
processSolidusInTag
@state = @states[:beforeAttributeName]
elsif data == "<"
@stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected < character in attribute name.")})
emitCurrentToken
leavingThisState = false
else
@currentToken[:data][-1][0] += data
leavingThisState = false
@ -538,11 +537,6 @@ module HTML5lib
elsif data == "/"
processSolidusInTag
@state = @states[:beforeAttributeName]
elsif data == "<"
@stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected < character. Expected = or end of tag.")})
emitCurrentToken
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file. Expected = or end of tag.")})
@ -567,11 +561,6 @@ module HTML5lib
@state = @states[:attributeValueSingleQuoted]
elsif data == ">"
emitCurrentToken
elsif data == "<"
@stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected < character. Expected attribute value.")})
emitCurrentToken
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file. Expected attribute value.")})
@ -625,11 +614,6 @@ module HTML5lib
processEntityInAttribute
elsif data == ">"
emitCurrentToken
elsif data == "<"
@stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected < character in attribute value.")})
emitCurrentToken
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in attribute value.")})
@ -659,14 +643,15 @@ module HTML5lib
charStack = [@stream.char, @stream.char]
if charStack == ["-", "-"]
@currentToken = {:type => :Comment, :data => ""}
@state = @states[:comment]
@state = @states[:commentStart]
else
5.times { charStack.push(@stream.char) }
# Put in explicit :EOF check
if ((not charStack.include? :EOF) and
charStack.join("").upcase == "DOCTYPE")
@currentToken =\
{:type => :Doctype, :name => "", :data => true}
{:type => :Doctype, :name => "",
:publicId => nil, :systemId => nil, :correct => true}
@state = @states[:doctype]
else
@tokenQueue.push({:type => :ParseError, :data =>
@ -678,10 +663,52 @@ module HTML5lib
return true
end
def commentStartState
data = @stream.char
if data == "-"
@state = @states[:commentStartDash]
elsif data == ">"
@tokenQueue.push({:type => :ParseError, :data =>
_("Incorrect comment.")})
@tokenQueue.push(@currentToken)
@state = @states[:data]
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in comment.")})
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
@currentToken[:data] += data + @stream.chars_until("-")
@state = @states[:comment]
end
return true
end
def commentStartDashState
data = @stream.char
if data == "-"
@state = @states[:commentEnd]
elsif data == ">"
@tokenQueue.push({:type => :ParseError, :data =>
_("Incorrect comment.")})
@tokenQueue.push(@currentToken)
@state = @states[:data]
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in comment.")})
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
@currentToken[:data] += data + @stream.chars_until("-")
@state = @states[:comment]
end
return true
end
def commentState
data = @stream.char
if data == "-"
@state = @states[:commentDash]
@state = @states[:commentEndDash]
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in comment.")})
@ -693,7 +720,7 @@ module HTML5lib
return true
end
def commentDashState
def commentEndDashState
data = @stream.char
if data == "-"
@state = @states[:commentEnd]
@ -753,19 +780,16 @@ module HTML5lib
def beforeDoctypeNameState
data = @stream.char
if SPACE_CHARACTERS.include? data
elsif ASCII_LOWERCASE.include? data
@currentToken[:name] = data.upcase
@state = @states[:doctypeName]
elsif data == ">"
# Character needs to be consumed per the specification so don't
# invoke emitCurrentTokenWithParseError with :data as argument.
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected > character. Expected DOCTYPE name.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file. Expected DOCTYPE name.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
@ -777,33 +801,21 @@ module HTML5lib
def doctypeNameState
data = @stream.char
needsDoctypeCheck = false
if SPACE_CHARACTERS.include? data
@state = @states[:afterDoctypeName]
needsDoctypeCheck = true
elsif data == ">"
@tokenQueue.push(@currentToken)
@state = @states[:data]
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE name.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
# We can't just uppercase everything that arrives here. For
# instance, non-ASCII characters.
if ASCII_LOWERCASE.include? data
data = data.upcase
end
@currentToken[:name] += data
needsDoctypeCheck = true
end
# After some iterations through this state it should eventually say
# "HTML". Otherwise there's an error.
if needsDoctypeCheck and @currentToken[:name] == "HTML"
@currentToken[:data] = false
end
return true
end
@ -815,16 +827,195 @@ module HTML5lib
@state = @states[:data]
elsif data == :EOF
@currentToken[:data] = true
# XXX EMIT
@stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
charStack = [data]
5.times { charStack << @stream.char }
token = charStack.join('').tr(ASCII_UPPERCASE,ASCII_LOWERCASE)
if token == "public"
@state = @states[:beforeDoctypePublicIdentifier]
elsif token == "system"
@state = @states[:beforeDoctypeSystemIdentifier]
else
@stream.queue += charStack
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected 'public' or 'system'. Got '#{charStack.join('')}'")})
@state = @states[:bogusDoctype]
end
end
return true
end
def beforeDoctypePublicIdentifierState
data = @stream.char
if SPACE_CHARACTERS.include?(data)
elsif data == "\""
@currentToken[:publicId] = ""
@state = @states[:doctypePublicIdentifierDoubleQuoted]
elsif data == "'"
@currentToken[:publicId] = ""
@state = @states[:doctypePublicIdentifierSingleQuoted]
elsif data == ">"
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of DOCTYPE.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected space or '>'. Got '" + data + "'")})
@currentToken[:data] = true
_("Unexpected character in DOCTYPE.")})
@state = @states[:bogusDoctype]
end
return true
end
def doctypePublicIdentifierDoubleQuotedState
data = @stream.char
if data == "\""
@state = @states[:afterDoctypePublicIdentifier]
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
@currentToken[:publicId] += data
end
return true
end
def doctypePublicIdentifierSingleQuotedState
data = @stream.char
if data == "'"
@state = @states[:afterDoctypePublicIdentifier]
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
@currentToken[:publicId] += data
end
return true
end
def afterDoctypePublicIdentifierState
data = @stream.char
if SPACE_CHARACTERS.include?(data)
elsif data == "\""
@currentToken[:systemId] = ""
@state = @states[:doctypeSystemIdentifierDoubleQuoted]
elsif data == "'"
@currentToken[:systemId] = ""
@state = @states[:doctypeSystemIdentifierSingleQuoted]
elsif data == ">"
@tokenQueue.push(@currentToken)
@state = @states[:data]
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected character in DOCTYPE.")})
@state = @states[:bogusDoctype]
end
return true
end
def beforeDoctypeSystemIdentifierState
data = @stream.char
if SPACE_CHARACTERS.include?(data)
elsif data == "\""
@currentToken[:systemId] = ""
@state = @states[:doctypeSystemIdentifierDoubleQuoted]
elsif data == "'"
@currentToken[:systemId] = ""
@state = @states[:doctypeSystemIdentifierSingleQuoted]
elsif data == ">"
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected character in DOCTYPE.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected character in DOCTYPE.")})
@state = @states[:bogusDoctype]
end
return true
end
def doctypeSystemIdentifierDoubleQuotedState
data = @stream.char
if data == "\""
@state = @states[:afterDoctypeSystemIdentifier]
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
@currentToken[:systemId] += data
end
return true
end
def doctypeSystemIdentifierSingleQuotedState
data = @stream.char
if data == "'"
@state = @states[:afterDoctypeSystemIdentifier]
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
@currentToken[:systemId] += data
end
return true
end
def afterDoctypeSystemIdentifierState
data = @stream.char
if SPACE_CHARACTERS.include?(data)
elsif data == ">"
@tokenQueue.push(@currentToken)
@state = @states[:data]
elsif data == :EOF
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
else
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected character in DOCTYPE.")})
@state = @states[:bogusDoctype]
end
return true
@@ -840,6 +1031,7 @@ module HTML5lib
@stream.queue.push(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in bogus doctype.")})
@currentToken[:correct] = false
@tokenQueue.push(@currentToken)
@state = @states[:data]
end


@@ -1,21 +1,24 @@
module HTML5lib
module TreeBuilders
def self.getTreeBuilder(name)
case name.to_s.downcase
class << self
def [](name)
case name.to_s.downcase
when 'simpletree' then
require 'html5lib/treebuilders/simpletree'
SimpleTree::TreeBuilder
when 'rexml' then
require 'html5lib/treebuilders/rexml'
REXMLTree::TreeBuilder
REXML::TreeBuilder
when 'hpricot' then
require 'html5lib/treebuilders/hpricot'
Hpricot::TreeBuilder
else
raise "Unknown TreeBuilder #{name}"
end
end
end
alias :getTreeBuilder :[]
end
end
end
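A minimal usage sketch of the factory above, assuming html5lib is on the load path; the require paths and calls mirror what parse.rb (further down in this commit) does, and the input markup is purely illustrative:

require 'html5lib/treebuilders'
require 'html5lib/html5parser'

# Look up a builder class by name (the alias keeps getTreeBuilder callers working)
# and hand it to the parser, as parse.rb does.
builder  = HTML5lib::TreeBuilders['simpletree']
parser   = HTML5lib::HTMLParser.new(:tree => builder)
document = parser.parse("<p>Hello <b>world")   # illustrative input string
puts parser.tree.testSerializer(document)      # same debug-tree call parse.rb uses for --tree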


@@ -144,7 +144,7 @@ module HTML5lib
# code. It should still do the same though.
# Step 1: stop the algorithm when there's nothing to do.
return unless @activeFormattingElements
return if @activeFormattingElements.empty?
# Step 2 and step 3: we start with the last element. So i is -1.
i = -1


@@ -1,4 +1,5 @@
require 'html5lib/treebuilders/base'
require 'rubygems'
require 'hpricot'
require 'forwardable'
@@ -26,12 +27,17 @@ module HTML5lib
childNodes << node
hpricot.children << node.hpricot
end
if (oldparent = node.hpricot.parent) != nil
oldparent.children.delete_at(oldparent.children.index(node.hpricot))
end
node.hpricot.parent = hpricot
node.parent = self
end
def removeChild(node)
childNodes.delete(node)
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
node.hpricot.parent = nil
node.parent = nil
end
@@ -48,6 +54,7 @@ module HTML5lib
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
else
refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
childNodes.insert(index, node)
end
end


@@ -4,7 +4,7 @@ require 'forwardable'
module HTML5lib
module TreeBuilders
module REXMLTree
module REXML
class Node < Base::Node
extend Forwardable
@@ -52,6 +52,7 @@ module HTML5lib
childNodes[index-1].rxobj.raw = true
else
childNodes.insert index, node
refNode.rxobj.parent.insert_before(refNode.rxobj,node.rxobj)
end
end
@@ -62,7 +63,7 @@ module HTML5lib
class Element < Node
def self.rxclass
REXML::Element
::REXML::Element
end
def initialize name
@@ -95,7 +96,7 @@ module HTML5lib
class Document < Node
def self.rxclass
REXML::Document
::REXML::Document
end
def initialize
@@ -120,7 +121,7 @@ module HTML5lib
class DocumentType < Node
def self.rxclass
REXML::DocType
::REXML::DocType
end
def printTree indent=0
@@ -145,7 +146,7 @@ module HTML5lib
class TextNode < Node
def initialize data
raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;')
@rxobj = REXML::Text.new(raw, true, nil, true)
@rxobj = ::REXML::Text.new(raw, true, nil, true)
end
def printTree indent=0
@@ -155,7 +156,7 @@ module HTML5lib
class CommentNode < Node
def self.rxclass
REXML::Comment
::REXML::Comment
end
def printTree indent=0


@@ -78,7 +78,7 @@ module HTML5lib
class Element < Node
def to_s
"<%s>" % name
"<#{name}>"
end
def printTree indent=0


@@ -0,0 +1,26 @@
require 'html5lib/treewalkers/base'
module HTML5lib
module TreeWalkers
class << self
def [](name)
case name.to_s.downcase
when 'simpletree' then
require 'html5lib/treewalkers/simpletree'
SimpleTree::TreeWalker
when 'rexml' then
require 'html5lib/treewalkers/rexml'
REXML::TreeWalker
when 'hpricot' then
require 'html5lib/treewalkers/hpricot'
Hpricot::TreeWalker
else
raise "Unknown TreeWalker #{name}"
end
end
alias :getTreeWalker :[]
end
end
end
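A sketch of how the walker factory above combines with the serializer, mirroring the --html path of parse.rb further down; the input markup is illustrative and the serializer options are assumed to default sensibly:

require 'html5lib/html5parser'
require 'html5lib/treebuilders'
require 'html5lib/treewalkers'
require 'html5lib/serializer'

# Parse into a simpletree document, wrap it in the matching walker,
# and re-serialize the resulting token stream.
parser   = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders['simpletree'])
document = parser.parse("<table><tr><td>cell")   # illustrative input markup
tokens   = HTML5lib::TreeWalkers['simpletree'].new(document)
puts HTML5lib::HTMLSerializer.serialize(tokens, :omit_optional_tags => false)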


@@ -0,0 +1,156 @@
require 'html5lib/constants'
module HTML5lib
module TreeWalkers
module TokenConstructor
def error(msg)
return {:type => "SerializeError", :data => msg}
end
def normalizeAttrs(attrs)
attrs.to_a
end
def emptyTag(name, attrs, hasChildren=false)
error(_("Void element has children")) if hasChildren
return({:type => :EmptyTag, :name => name, \
:data => normalizeAttrs(attrs)})
end
def startTag(name, attrs)
return {:type => :StartTag, :name => name, \
:data => normalizeAttrs(attrs)}
end
def endTag(name)
return {:type => :EndTag, :name => name, :data => []}
end
def text(data)
if data =~ /\A([#{SPACE_CHARACTERS.join('')}]+)/m
yield({:type => :SpaceCharacters, :data => $1})
data = data[$1.length .. -1]
return if data.empty?
end
if data =~ /([#{SPACE_CHARACTERS.join('')}]+)\Z/m
yield({:type => :Characters, :data => data[0 ... -$1.length]})
yield({:type => :SpaceCharacters, :data => $1})
else
yield({:type => :Characters, :data => data})
end
end
def comment(data)
return {:type => :Comment, :data => data}
end
def doctype(name)
return {:type => :Doctype, :name => name, :data => name.upcase() == "HTML"}
end
def unknown(nodeType)
return error(_("Unknown node type: ") + nodeType.to_s)
end
def _(str)
str
end
end
class Base
include TokenConstructor
def initialize(tree)
@tree = tree
end
def each
raise NotImplementedError
end
alias walk each
end
class NonRecursiveTreeWalker < TreeWalkers::Base
def node_details(node)
raise NotImplementedError
end
def first_child(node)
raise NotImplementedError
end
def next_sibling(node)
raise NotImplementedError
end
def parent(node)
raise NotImplementedError
end
def each
currentNode = @tree
while currentNode != nil
details = node_details(currentNode)
hasChildren = false
case details.shift
when :DOCTYPE
yield doctype(*details)
when :TEXT
text(*details) {|token| yield token}
when :ELEMENT
name, attributes, hasChildren = details
if VOID_ELEMENTS.include?(name)
yield emptyTag(name, attributes.to_a, hasChildren)
hasChildren = false
else
yield startTag(name, attributes.to_a)
end
when :COMMENT
yield comment(details[0])
when :DOCUMENT, :DOCUMENT_FRAGMENT
hasChildren = true
when nil
# ignore (REXML::XMLDecl is an example)
else
yield unknown(details[0])
end
firstChild = hasChildren ? first_child(currentNode) : nil
if firstChild != nil
currentNode = firstChild
else
while currentNode != nil
details = node_details(currentNode)
if details.shift == :ELEMENT
name, attributes, hasChildren = details
yield endTag(name) if !VOID_ELEMENTS.include?(name)
end
if @tree == currentNode
currentNode = nil
else
nextSibling = next_sibling(currentNode)
if nextSibling != nil
currentNode = nextSibling
break
end
currentNode = parent(currentNode)
end
end
end
end
end
end
end
end


@@ -0,0 +1,48 @@
require 'html5lib/treewalkers/base'
require 'rexml/document'
module HTML5lib
module TreeWalkers
module Hpricot
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
def node_details(node)
case node
when ::Hpricot::Elem
if node.name.empty?
[:DOCUMENT_FRAGMENT]
else
[:ELEMENT, node.name,
node.attributes.map {|name,value| [name,value]},
!node.empty?]
end
when ::Hpricot::Text
[:TEXT, node.to_plain_text]
when ::Hpricot::Comment
[:COMMENT, node.content]
when ::Hpricot::Doc
[:DOCUMENT]
when ::Hpricot::DocType
[:DOCTYPE, node.target]
when ::Hpricot::XMLDecl
[nil]
else
[:UNKNOWN, node.class.inspect]
end
end
def first_child(node)
node.children.first
end
def next_sibling(node)
node.next_node
end
def parent(node)
node.parent
end
end
end
end
end


@@ -0,0 +1,48 @@
require 'html5lib/treewalkers/base'
require 'rexml/document'
module HTML5lib
module TreeWalkers
module REXML
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
def node_details(node)
case node
when ::REXML::Document
[:DOCUMENT]
when ::REXML::Element
if !node.name
[:DOCUMENT_FRAGMENT]
else
[:ELEMENT, node.name,
node.attributes.map {|name,value| [name,value]},
node.has_elements? || node.has_text?]
end
when ::REXML::Text
[:TEXT, node.value]
when ::REXML::Comment
[:COMMENT, node.string]
when ::REXML::DocType
[:DOCTYPE, node.name]
when ::REXML::XMLDecl
[nil]
else
[:UNKNOWN, node.class.inspect]
end
end
def first_child(node)
node.children.first
end
def next_sibling(node)
node.next_sibling
end
def parent(node)
node.parent
end
end
end
end
end


@@ -0,0 +1,48 @@
require 'html5lib/treewalkers/base'
module HTML5lib
module TreeWalkers
module SimpleTree
class TreeWalker < HTML5lib::TreeWalkers::Base
include HTML5lib::TreeBuilders::SimpleTree
def walk(node)
case node
when Document, DocumentFragment
return
when DocumentType
yield doctype(node.name)
when TextNode
text(node.value) {|token| yield token}
when Element
if VOID_ELEMENTS.include?(node.name)
yield emptyTag(node.name, node.attributes, node.hasContent())
else
yield startTag(node.name, node.attributes)
for child in node.childNodes
walk(child) {|token| yield token}
end
yield endTag(node.name)
end
when CommentNode
yield comment(node.value)
else
puts '?'
yield unknown(node.class)
end
end
def each
for child in @tree.childNodes
walk(child) {|node| yield node}
end
end
end
end
end
end

vendor/plugins/HTML5lib/parse.rb (vendored executable file, 213 lines)

@@ -0,0 +1,213 @@
#!/usr/bin/env ruby
#
# Parse a document to a simpletree tree, with optional profiling
$:.unshift File.dirname(__FILE__),'lib'
def parse(opts, args)
encoding = nil
f = args[-1]
if f
begin
if f[0..6] == 'http://'
require 'open-uri'
f = URI.parse(f).open
encoding = f.charset
elsif f == '-'
f = $stdin
else
f = open(f)
end
rescue
end
else
$stderr.write("No filename provided. Use -h for help\n")
exit(1)
end
require 'html5lib/treebuilders'
treebuilder = HTML5lib::TreeBuilders[opts.treebuilder]
if opts.output == :xml
require 'html5lib/liberalxmlparser'
p = HTML5lib::XHTMLParser.new(:tree=>treebuilder)
else
require 'html5lib/html5parser'
p = HTML5lib::HTMLParser.new(:tree=>treebuilder)
end
if opts.parsemethod == :parse
args = [f, encoding]
else
args = [f, 'div', encoding]
end
if opts.profile
require 'profiler'
Profiler__::start_profile
p.send(opts.parsemethod, *args)
Profiler__::stop_profile
Profiler__::print_profile($stderr)
elsif opts.time
require 'time'
t0 = Time.new
document = p.send(opts.parsemethod, *args)
t1 = Time.new
printOutput(p, document, opts)
t2 = Time.new
puts "\n\nRun took: %fs (plus %fs to print the output)"%[t1-t0, t2-t1]
else
document = p.send(opts.parsemethod, *args)
printOutput(p, document, opts)
end
end
def printOutput(parser, document, opts)
puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding
case opts.output
when :xml
print document
when :html
require 'html5lib/treewalkers'
tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document)
require 'html5lib/serializer'
puts HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer)
when :hilite
print document.hilite
when :tree
document = [document] unless document.respond_to?(:each)
document.each {|fragment| puts parser.tree.testSerializer(fragment)}
end
if opts.error
errList=[]
for pos, message in parser.errors
errList << ("Line %i Col %i"%pos + " " + message)
end
$stdout.write("\nParse errors:\n" + errList.join("\n")+"\n")
end
end
require 'ostruct'
options = OpenStruct.new
options.profile = false
options.time = false
options.output = :html
options.treebuilder = 'simpletree'
options.error = false
options.encoding = false
options.parsemethod = :parse
options.serializer = {
:encoding => 'utf-8',
:omit_optional_tags => false,
:inject_meta_charset => false
}
require 'optparse'
opts = OptionParser.new do |opts|
opts.separator ""
opts.separator "Parse Options:"
opts.on("-b", "--treebuilder NAME") do |treebuilder|
options.treebuilder = treebuilder
end
opts.on("-f", "--fragment", "Parse as a fragment") do |parse|
options.parsemethod = :parseFragment
end
opts.separator ""
opts.separator "Filter Options:"
opts.on("--[no-]inject-meta-charset", "inject <meta charset>") do |inject|
options.serializer[:inject_meta_charset] = inject
end
opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip|
options.serializer[:strip_whitespace] = strip
end
opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize|
options.serializer[:sanitize] = sanitize
end
opts.separator ""
opts.separator "Output Options:"
opts.on("--tree", "output as debug tree") do |tree|
options.output = :tree
end
opts.on("-x", "--xml", "output as xml") do |xml|
options.output = :xml
options.treebuilder = "rexml"
end
opts.on("--[no-]html", "Output as html") do |html|
options.output = (html ? :html : nil)
end
opts.on("--hilite", "Output as formatted highlighted code.") do |hilite|
options.output = :hilite
end
opts.on("-e", "--error", "Print a list of parse errors") do |error|
options.error = error
end
opts.separator ""
opts.separator "Serialization Options:"
opts.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit|
options.serializer[:omit_optional_tags] = omit
end
opts.on("--[no-]quote-attr-values", "Quote attribute values") do |quote|
options.serializer[:quote_attr_values] = quote
end
opts.on("--[no-]use-best-quote-char", "Use best quote character") do |best|
options.serializer[:use_best_quote_char] = best
end
opts.on("--quote-char C", "Use specified quote character") do |c|
options.serializer[:quote_char] = c
end
opts.on("--[no-]minimize-boolean-attributes", "Minimize boolean attributes") do |min|
options.serializer[:minimize_boolean_attributes] = min
end
opts.on("--[no-]use-trailing-solidus", "Use trailing solidus") do |slash|
options.serializer[:use_trailing_solidus] = slash
end
opts.on("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values") do |lt|
options.serializer[:escape_lt_in_attrs] = lt
end
opts.separator ""
opts.separator "Other Options:"
opts.on("-p", "--[no-]profile", "Profile the run") do |profile|
options.profile = profile
end
opts.on("-t", "--[no-]time", "Time the run") do |time|
options.time = time
end
opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
options.encoding = encoding
end
opts.on_tail("-h", "--help", "Show this message") do
puts opts
exit
end
end
opts.parse!(ARGV)
parse options, ARGV


@@ -0,0 +1,51 @@
老子《道德經》 第一~四十章
老子道經
第一章
道可道,非常道。名可名,非常名。無,名天地之始﹔有,名萬物之母。
故常無,欲以觀其妙;常有,欲以觀其徼。此兩者,同出而異名,同謂之
玄。玄之又玄,眾妙之門。
第二章
天下皆知美之為美,斯惡矣﹔皆知善之為善,斯不善矣。故有無相生,難
易相成,長短相形,高下相傾,音聲相和,前後相隨。是以聖人處「無為
」之事,行「不言」之教。萬物作焉而不辭,生而不有,為而不恃,功成
而弗居。夫唯弗居,是以不去。
第三章
不尚賢,使民不爭﹔不貴難得之貨,使民不為盜﹔不見可欲,使民心不亂
。是以「聖人」之治,虛其心,實其腹,弱其志,強其骨。常使民無知無
欲。使夫智者不敢為也。為「無為」,則無不治。
第四章
「道」沖,而用之或不盈。淵兮,似萬物之宗﹔挫其銳,解其紛,和其光
,同其塵﹔湛兮似或存。吾不知誰之子?象帝之先。
第五章
天地不仁,以萬物為芻狗﹔聖人不仁,以百姓為芻狗。天地之間,其猶橐
蘥乎?虛而不屈,動而愈出。多言數窮,不如守中。
第六章
谷神不死,是謂玄牝。玄牝之門,是謂天地根。綿綿若存,用之不勤。
第七章
天長地久。天地所以能長且久者,以其不自生,故能長久。是以聖人後其
身而身先,外其身而身存。非以其無私邪?故能成其私。
第八章
上善若水。水善利萬物而不爭。處眾人之所惡,故幾於道。居善地,心善
淵,與善仁,言善信,政善治,事善能,動善時。夫唯不爭,故無尤。
第九章
持而盈之,不如其已﹔揣而銳之,不可長保。金玉滿堂,莫之能守﹔富貴
而驕,自遺其咎。功遂身退,天之道。


@@ -0,0 +1,10 @@
#data
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=euc-jp">
<!--京-->
<title>Yahoo! JAPAN</title>
<meta name="description" content="日本最大級のポータルサイト。検索、オークション、ニュース、メール、コミュニティ、ショッピング、など80以上のサービスを展開。あなたの生活をより豊かにする「ライフ・エンジン」を目指していきます。">
<style type="text/css" media="all">
#encoding
euc-jp

File diff suppressed because one or more lines are too long


@@ -0,0 +1,82 @@
#data
<meta
#encoding
windows-1252
#data
<
#encoding
windows-1252
#data
<!
#encoding
windows-1252
#data
<meta charset = "
#encoding
windows-1252
#data
<meta charset=EUC-jp
#encoding
windows-1252
#data
<meta <meta charset='EUC-jp'>
#encoding
EUC-jp
#data
<meta charset = 'EUC-jp'>
#encoding
EUC-jp
#data
<!-- -->
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
#encoding
utf-8
#data
<!-- -->
<meta http-equiv="Content-Type" content="text/html; charset=utf
#encoding
windows-1252
#data
<meta http-equiv="Content-Type<meta charset="utf-8">
#encoding
windows-1252
#data
<meta http-equiv="Content-Type" content="text/html; charset='utf-8'">
#encoding
utf-8
#data
<meta http-equiv="Content-Type" content="text/html; charset='utf-8">
#encoding
windows-1252
#data
<meta
#encoding
windows-1252
#data
<meta charset =
#encoding
windows-1252
#data
<meta charset= utf-8
#encoding
windows-1252
#data
<meta content = "text/html;
#encoding
windows-1252


@@ -0,0 +1,409 @@
[
{
"name": "IE_Comments",
"input": "<!--[if gte IE 4]><script>alert('XSS');</script><![endif]-->",
"output": ""
},
{
"name": "IE_Comments_2",
"input": "<![if !IE 5]><script>alert('XSS');</script><![endif]>",
"output": "&lt;script&gt;alert('XSS');&lt;/script&gt;",
"rexml": "Ill-formed XHTML!"
},
{
"name": "allow_colons_in_path_component",
"input": "<a href=\"./this:that\">foo</a>",
"output": "<a href='./this:that'>foo</a>"
},
{
"name": "background_attribute",
"input": "<div background=\"javascript:alert('XSS')\"></div>",
"output": "<div/>",
"xhtml": "<div></div>",
"rexml": "<div></div>"
},
{
"name": "bgsound",
"input": "<bgsound src=\"javascript:alert('XSS');\" />",
"output": "&lt;bgsound src=\"javascript:alert('XSS');\"/&gt;",
"rexml": "&lt;bgsound src=\"javascript:alert('XSS');\"&gt;&lt;/bgsound&gt;"
},
{
"name": "div_background_image_unicode_encoded",
"input": "<div style=\"background-image:\u00a5\u00a2\u006C\u0028'\u006a\u0061\u00a6\u0061\u00a3\u0063\u00a2\u0069\u00a0\u00a4\u003a\u0061\u006c\u0065\u00a2\u00a4\u0028.1027\u0058.1053\u0053\u0027\u0029'\u0029\">foo</div>",
"output": "<div style=''>foo</div>"
},
{
"name": "div_expression",
"input": "<div style=\"width: expression(alert('XSS'));\">foo</div>",
"output": "<div style=''>foo</div>"
},
{
"name": "double_open_angle_brackets",
"input": "<img src=http://ha.ckers.org/scriptlet.html <",
"output": "<img src='http://ha.ckers.org/scriptlet.html'/>",
"rexml": "Ill-formed XHTML!"
},
{
"name": "double_open_angle_brackets_2",
"input": "<script src=http://ha.ckers.org/scriptlet.html <",
"output": "&lt;script src=\"http://ha.ckers.org/scriptlet.html\" &lt;=\"\"&gt;",
"rexml": "Ill-formed XHTML!"
},
{
"name": "grave_accents",
"input": "<img src=`javascript:alert('XSS')` />",
"output": "<img/>",
"rexml": "Ill-formed XHTML!"
},
{
"name": "img_dynsrc_lowsrc",
"input": "<img dynsrc=\"javascript:alert('XSS')\" />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "img_vbscript",
"input": "<img src='vbscript:msgbox(\"XSS\")' />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "input_image",
"input": "<input type=\"image\" src=\"javascript:alert('XSS');\" />",
"output": "<input type='image'/>",
"rexml": "<input type='image' />"
},
{
"name": "link_stylesheets",
"input": "<link rel=\"stylesheet\" href=\"javascript:alert('XSS');\" />",
"output": "&lt;link rel=\"stylesheet\" href=\"javascript:alert('XSS');\"/&gt;",
"rexml": "&lt;link href=\"javascript:alert('XSS');\" rel=\"stylesheet\"/&gt;"
},
{
"name": "link_stylesheets_2",
"input": "<link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\" />",
"output": "&lt;link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\"/&gt;",
"rexml": "&lt;link href=\"http://ha.ckers.org/xss.css\" rel=\"stylesheet\"/&gt;"
},
{
"name": "list_style_image",
"input": "<li style=\"list-style-image: url(javascript:alert('XSS'))\">foo</li>",
"output": "<li style=''>foo</li>"
},
{
"name": "no_closing_script_tags",
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;",
"rexml": "Ill-formed XHTML!"
},
{
"name": "non_alpha_non_digit",
"input": "<script/XSS src=\"http://ha.ckers.org/xss.js\"></script>",
"output": "&lt;script XSS=\"\" src=\"http://ha.ckers.org/xss.js\"&gt;&lt;/script&gt;",
"rexml": "Ill-formed XHTML!"
},
{
"name": "non_alpha_non_digit_2",
"input": "<a onclick!\\#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>foo</a>",
"output": "<a>foo</a>",
"rexml": "Ill-formed XHTML!"
},
{
"name": "non_alpha_non_digit_3",
"input": "<img/src=\"http://ha.ckers.org/xss.js\"/>",
"output": "<img src='http://ha.ckers.org/xss.js'/>",
"rexml": "Ill-formed XHTML!"
},
{
"name": "non_alpha_non_digit_II",
"input": "<a href!\\#$%&()*~+-_.,:;?@[/|]^`=alert('XSS')>foo</a>",
"output": "<a>foo</a>",
"rexml": "Ill-formed XHTML!"
},
{
"name": "non_alpha_non_digit_III",
"input": "<a/href=\"javascript:alert('XSS');\">foo</a>",
"output": "<a>foo</a>",
"rexml": "Ill-formed XHTML!"
},
{
"name": "platypus",
"input": "<a href=\"http://www.ragingplatypus.com/\" style=\"display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;\">never trust your upstream platypus</a>",
"output": "<a href='http://www.ragingplatypus.com/' style='display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;'>never trust your upstream platypus</a>"
},
{
"name": "protocol_resolution_in_script_tag",
"input": "<script src=//ha.ckers.org/.j></script>",
"output": "&lt;script src=\"//ha.ckers.org/.j\"&gt;&lt;/script&gt;",
"rexml": "Ill-formed XHTML!"
},
{
"name": "should_allow_anchors",
"input": "<a href='foo' onclick='bar'><script>baz</script></a>",
"output": "<a href='foo'>&lt;script&gt;baz&lt;/script&gt;</a>"
},
{
"name": "should_allow_image_alt_attribute",
"input": "<img alt='foo' onclick='bar' />",
"output": "<img alt='foo'/>",
"rexml": "<img alt='foo' />"
},
{
"name": "should_allow_image_height_attribute",
"input": "<img height='foo' onclick='bar' />",
"output": "<img height='foo'/>",
"rexml": "<img height='foo' />"
},
{
"name": "should_allow_image_src_attribute",
"input": "<img src='foo' onclick='bar' />",
"output": "<img src='foo'/>",
"rexml": "<img src='foo' />"
},
{
"name": "should_allow_image_width_attribute",
"input": "<img width='foo' onclick='bar' />",
"output": "<img width='foo'/>",
"rexml": "<img width='foo' />"
},
{
"name": "should_handle_blank_text",
"input": "",
"output": ""
},
{
"name": "should_handle_malformed_image_tags",
"input": "<img \"\"\"><script>alert(\"XSS\")</script>\">",
"output": "<img/>&lt;script&gt;alert(\"XSS\")&lt;/script&gt;\"&gt;",
"rexml": "Ill-formed XHTML!"
},
{
"name": "should_handle_non_html",
"input": "abc",
"output": "abc"
},
{
"name": "should_not_fall_for_ridiculous_hack",
"input": "<img\nsrc\n=\n\"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n\"\n />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "should_not_fall_for_xss_image_hack_0",
"input": "<img src=\"javascript:alert('XSS');\" />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "should_not_fall_for_xss_image_hack_1",
"input": "<img src=javascript:alert('XSS') />",
"output": "<img/>",
"rexml": "Ill-formed XHTML!"
},
{
"name": "should_not_fall_for_xss_image_hack_10",
"input": "<img src=\"jav&#x0A;ascript:alert('XSS');\" />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "should_not_fall_for_xss_image_hack_11",
"input": "<img src=\"jav&#x0D;ascript:alert('XSS');\" />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "should_not_fall_for_xss_image_hack_12",
"input": "<img src=\" &#14; javascript:alert('XSS');\" />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "should_not_fall_for_xss_image_hack_13",
"input": "<img src=\"&#x20;javascript:alert('XSS');\" />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "should_not_fall_for_xss_image_hack_14",
"input": "<img src=\"&#xA0;javascript:alert('XSS');\" />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "should_not_fall_for_xss_image_hack_2",
"input": "<img src=\"JaVaScRiPt:alert('XSS')\" />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "should_not_fall_for_xss_image_hack_3",
"input": "<img src='javascript:alert(&quot;XSS&quot;)' />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "should_not_fall_for_xss_image_hack_4",
"input": "<img src='javascript:alert(String.fromCharCode(88,83,83))' />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "should_not_fall_for_xss_image_hack_5",
"input": "<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "should_not_fall_for_xss_image_hack_6",
"input": "<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041' />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "should_not_fall_for_xss_image_hack_7",
"input": "<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29' />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "should_not_fall_for_xss_image_hack_8",
"input": "<img src=\"jav\tascript:alert('XSS');\" />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "should_not_fall_for_xss_image_hack_9",
"input": "<img src=\"jav&#x09;ascript:alert('XSS');\" />",
"output": "<img/>",
"rexml": "<img />"
},
{
"name": "should_sanitize_half_open_scripts",
"input": "<img src=\"javascript:alert('XSS')\"",
"output": "<img/>",
"rexml": "Ill-formed XHTML!"
},
{
"name": "should_sanitize_invalid_script_tag",
"input": "<script/XSS SRC=\"http://ha.ckers.org/xss.js\"></script>",
"output": "&lt;script XSS=\"\" SRC=\"http://ha.ckers.org/xss.js\"&gt;&lt;/script&gt;",
"rexml": "Ill-formed XHTML!"
},
{
"name": "should_sanitize_script_tag_with_multiple_open_brackets",
"input": "<<script>alert(\"XSS\");//<</script>",
"output": "&lt;&lt;script&gt;alert(\"XSS\");//&lt;&lt;/script&gt;",
"rexml": "Ill-formed XHTML!"
},
{
"name": "should_sanitize_script_tag_with_multiple_open_brackets_2",
"input": "<iframe src=http://ha.ckers.org/scriptlet.html\n<",
"output": "&lt;iframe src=\"http://ha.ckers.org/scriptlet.html\" &lt;=\"\"&gt;",
"rexml": "Ill-formed XHTML!"
},
{
"name": "should_sanitize_tag_broken_up_by_null",
"input": "<scr\u0000ipt>alert(\"XSS\")</scr\u0000ipt>",
"output": "&lt;scr\ufffdipt&gt;alert(\"XSS\")&lt;/scr\ufffdipt&gt;",
"rexml": "Ill-formed XHTML!"
},
{
"name": "should_sanitize_unclosed_script",
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;",
"rexml": "Ill-formed XHTML!"
},
{
"name": "should_strip_href_attribute_in_a_with_bad_protocols",
"input": "<a href=\"javascript:XSS\" title=\"1\">boo</a>",
"output": "<a title='1'>boo</a>"
},
{
"name": "should_strip_href_attribute_in_a_with_bad_protocols_and_whitespace",
"input": "<a href=\" javascript:XSS\" title=\"1\">boo</a>",
"output": "<a title='1'>boo</a>"
},
{
"name": "should_strip_src_attribute_in_img_with_bad_protocols",
"input": "<img src=\"javascript:XSS\" title=\"1\">boo</img>",
"output": "<img title='1'/>boo",
"rexml": "<img title='1' />"
},
{
"name": "should_strip_src_attribute_in_img_with_bad_protocols_and_whitespace",
"input": "<img src=\" javascript:XSS\" title=\"1\">boo</img>",
"output": "<img title='1'/>boo",
"rexml": "<img title='1' />"
},
{
"name": "xml_base",
"input": "<div xml:base=\"javascript:alert('XSS');//\">foo</div>",
"output": "<div>foo</div>"
},
{
"name": "xul",
"input": "<p style=\"-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')\">fubar</p>",
"output": "<p style=''>fubar</p>"
}
]


@@ -0,0 +1,103 @@
{"tests": [
{"description": "proper attribute value escaping",
"input": [["StartTag", "span", {"title": "test \"with\" &quot;"}]],
"expected": ["<span title='test \"with\" &amp;quot;'>"]
},
{"description": "proper attribute value non-quoting",
"input": [["StartTag", "span", {"title": "foo"}]],
"expected": ["<span title=foo>"],
"xhtml": ["<span title=\"foo\">"]
},
{"description": "proper attribute value quoting (with >)",
"input": [["StartTag", "span", {"title": "foo>bar"}]],
"expected": ["<span title=\"foo>bar\">"]
},
{"description": "proper attribute value quoting (with <)",
"input": [["StartTag", "span", {"title": "foo<bar"}]],
"expected": ["<span title=\"foo<bar\">"],
"xhtml": ["<span title=\"foo&lt;bar\">"]
},
{"description": "proper attribute value quoting (with \")",
"input": [["StartTag", "span", {"title": "foo\"bar"}]],
"expected": ["<span title='foo\"bar'>"]
},
{"description": "proper attribute value quoting (with ')",
"input": [["StartTag", "span", {"title": "foo'bar"}]],
"expected": ["<span title=\"foo'bar\">"]
},
{"description": "proper attribute value quoting (with both \" and ')",
"input": [["StartTag", "span", {"title": "foo'bar\"baz"}]],
"expected": ["<span title=\"foo'bar&quot;baz\">"]
},
{"description": "proper attribute value quoting (with space)",
"input": [["StartTag", "span", {"title": "foo bar"}]],
"expected": ["<span title=\"foo bar\">"]
},
{"description": "proper attribute value quoting (with tab)",
"input": [["StartTag", "span", {"title": "foo\tbar"}]],
"expected": ["<span title=\"foo\tbar\">"]
},
{"description": "proper attribute value quoting (with LF)",
"input": [["StartTag", "span", {"title": "foo\nbar"}]],
"expected": ["<span title=\"foo\nbar\">"]
},
{"description": "proper attribute value quoting (with CR)",
"input": [["StartTag", "span", {"title": "foo\rbar"}]],
"expected": ["<span title=\"foo\rbar\">"]
},
{"description": "proper attribute value quoting (with linetab)",
"input": [["StartTag", "span", {"title": "foo\u000Bbar"}]],
"expected": ["<span title=\"foo\u000Bbar\">"]
},
{"description": "proper attribute value quoting (with form feed)",
"input": [["StartTag", "span", {"title": "foo\u000Cbar"}]],
"expected": ["<span title=\"foo\u000Cbar\">"]
},
{"description": "void element (as EmptyTag token)",
"input": [["EmptyTag", "img", {}]],
"expected": ["<img>"],
"xhtml": ["<img />"]
},
{"description": "void element (as StartTag token)",
"input": [["StartTag", "img", {}]],
"expected": ["<img>"],
"xhtml": ["<img />"]
},
{"description": "doctype in error",
"input": [["Doctype", "foo"]],
"expected": ["<!DOCTYPE foo>"]
},
{"description": "character data",
"options": {"encoding":"utf-8"},
"input": [["Characters", "a<b>c&d"]],
"expected": ["a&lt;b&gt;c&amp;d"]
},
{"description": "rcdata",
"input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]],
"expected": ["<script>a<b>c&d"]
},
{"description": "doctype",
"input": [["Doctype", "HTML"]],
"expected": ["<!DOCTYPE HTML>"]
}
]}


@@ -0,0 +1,65 @@
{"tests": [
{"description": "no encoding",
"options": {"inject_meta_charset": true},
"input": [["EmptyTag", "head", {}]],
"expected": ["<head>"]
},
{"description": "empytag head",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["EmptyTag", "head", {}]],
"expected": ["<head><meta charset=utf-8>"],
"xhtml": ["<head><meta charset=\"utf-8\" /></head>"]
},
{"description": "head w/title",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["StartTag","title",{}], ["Characters", "foo"],["EndTag", "title"], ["EndTag", "head"]],
"expected": ["<head><meta charset=utf-8><title>foo</title>"],
"xhtml": ["<head><meta charset=\"utf-8\" /><title>foo</title></head>"]
},
{"description": "head w/meta-charset",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["EmptyTag","meta",{"charset":"ascii"}], ["EndTag", "head"]],
"expected": ["<head><meta charset=utf-8>"],
"xhtml": ["<head><meta charset=\"utf-8\" /></head>"]
},
{"description": "head w/ two meta-charset",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["EmptyTag","meta",{"charset":"ascii"}], ["EmptyTag","meta",{"charset":"ascii"}], ["EndTag", "head"]],
"expected": ["<head><meta charset=utf-8><meta charset=utf-8>", "<head><meta charset=utf-8><meta charset=ascii>"],
"xhtml": ["<head><meta charset=\"utf-8\" /><meta charset=\"utf-8\" /></head>", "<head><meta charset=\"utf-8\" /><meta charset=\"ascii\" /></head>"]
},
{"description": "head w/robots",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["EmptyTag","meta",{"name":"robots","content":"noindex"}], ["EndTag", "head"]],
"expected": ["<head><meta charset=utf-8><meta content=noindex name=robots>"],
"xhtml": ["<head><meta charset=\"utf-8\" /><meta content=\"noindex\" name=\"robots\" /></head>"]
},
{"description": "head w/robots & charset",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["EmptyTag","meta",{"name":"robots","content":"noindex"}], ["EmptyTag","meta",{"charset":"ascii"}], ["EndTag", "head"]],
"expected": ["<head><meta content=noindex name=robots><meta charset=utf-8>"],
"xhtml": ["<head><meta content=\"noindex\" name=\"robots\" /><meta charset=\"utf-8\" /></head>"]
},
{"description": "head w/ charset in http-equiv content-type",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["EmptyTag","meta",{"http-equiv":"content-type", "content":"text/html; charset=ascii"}], ["EndTag", "head"]],
"expected": ["<head><meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"],
"xhtml": ["<head><meta content=\"text/html; charset=utf-8\" http-equiv=\"content-type\" /></head>"]
},
{"description": "head w/robots & charset in http-equiv content-type",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "head", {}], ["EmptyTag","meta",{"name":"robots","content":"noindex"}], ["EmptyTag","meta",{"http-equiv":"content-type", "content":"text/html; charset=ascii"}], ["EndTag", "head"]],
"expected": ["<head><meta content=noindex name=robots><meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"],
"xhtml": ["<head><meta content=\"noindex\" name=\"robots\" /><meta content=\"text/html; charset=utf-8\" http-equiv=\"content-type\" /></head>"]
}
]}


@@ -0,0 +1,900 @@
{"tests": [
{"description": "html start-tag followed by text, with attributes",
"input": [["StartTag", "html", {"lang": "en"}], ["Characters", "foo"]],
"expected": ["<html lang=en>foo"]
},
{"description": "html start-tag followed by comment",
"input": [["StartTag", "html", {}], ["Comment", "foo"]],
"expected": ["<html><!--foo-->"]
},
{"description": "html start-tag followed by space character",
"input": [["StartTag", "html", {}], ["Characters", " foo"]],
"expected": ["<html> foo"]
},
{"description": "html start-tag followed by text",
"input": [["StartTag", "html", {}], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "html start-tag followed by start-tag",
"input": [["StartTag", "html", {}], ["StartTag", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "html start-tag followed by end-tag",
"input": [["StartTag", "html", {}], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "html start-tag at EOF (shouldn't ever happen?!)",
"input": [["StartTag", "html", {}]],
"expected": [""]
},
{"description": "html end-tag followed by comment",
"input": [["EndTag", "html"], ["Comment", "foo"]],
"expected": ["</html><!--foo-->"]
},
{"description": "html end-tag followed by space character",
"input": [["EndTag", "html"], ["Characters", " foo"]],
"expected": ["</html> foo"]
},
{"description": "html end-tag followed by text",
"input": [["EndTag", "html"], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "html end-tag followed by start-tag",
"input": [["EndTag", "html"], ["StartTag", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "html end-tag followed by end-tag",
"input": [["EndTag", "html"], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "html end-tag at EOF",
"input": [["EndTag", "html"]],
"expected": [""]
},
{"description": "head start-tag followed by comment",
"input": [["StartTag", "head", {}], ["Comment", "foo"]],
"expected": ["<head><!--foo-->"]
},
{"description": "head start-tag followed by space character",
"input": [["StartTag", "head", {}], ["Characters", " foo"]],
"expected": ["<head> foo"]
},
{"description": "head start-tag followed by text",
"input": [["StartTag", "head", {}], ["Characters", "foo"]],
"expected": ["<head>foo"]
},
{"description": "head start-tag followed by start-tag",
"input": [["StartTag", "head", {}], ["StartTag", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "head start-tag followed by end-tag",
"input": [["StartTag", "head", {}], ["EndTag", "foo", {}]],
"expected": ["<head></foo>"]
},
{"description": "head start-tag at EOF (shouldn't ever happen?!)",
"input": [["StartTag", "head", {}]],
"expected": ["<head>"]
},
{"description": "head end-tag followed by comment",
"input": [["EndTag", "head"], ["Comment", "foo"]],
"expected": ["</head><!--foo-->"]
},
{"description": "head end-tag followed by space character",
"input": [["EndTag", "head"], ["Characters", " foo"]],
"expected": ["</head> foo"]
},
{"description": "head end-tag followed by text",
"input": [["EndTag", "head"], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "head end-tag followed by start-tag",
"input": [["EndTag", "head"], ["StartTag", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "head end-tag followed by end-tag",
"input": [["EndTag", "head"], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "head end-tag at EOF",
"input": [["EndTag", "head"]],
"expected": [""]
},
{"description": "body start-tag followed by comment",
"input": [["StartTag", "body", {}], ["Comment", "foo"]],
"expected": ["<body><!--foo-->"]
},
{"description": "body start-tag followed by space character",
"input": [["StartTag", "body", {}], ["Characters", " foo"]],
"expected": ["<body> foo"]
},
{"description": "body start-tag followed by text",
"input": [["StartTag", "body", {}], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "body start-tag followed by start-tag",
"input": [["StartTag", "body", {}], ["StartTag", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "body start-tag followed by end-tag",
"input": [["StartTag", "body", {}], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "body start-tag at EOF (shouldn't ever happen?!)",
"input": [["StartTag", "body", {}]],
"expected": [""]
},
{"description": "body end-tag followed by comment",
"input": [["EndTag", "body"], ["Comment", "foo"]],
"expected": ["</body><!--foo-->"]
},
{"description": "body end-tag followed by space character",
"input": [["EndTag", "body"], ["Characters", " foo"]],
"expected": ["</body> foo"]
},
{"description": "body end-tag followed by text",
"input": [["EndTag", "body"], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "body end-tag followed by start-tag",
"input": [["EndTag", "body"], ["StartTag", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "body end-tag followed by end-tag",
"input": [["EndTag", "body"], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "body end-tag at EOF",
"input": [["EndTag", "body"]],
"expected": [""]
},
{"description": "li end-tag followed by comment",
"input": [["EndTag", "li"], ["Comment", "foo"]],
"expected": ["</li><!--foo-->"]
},
{"description": "li end-tag followed by space character",
"input": [["EndTag", "li"], ["Characters", " foo"]],
"expected": ["</li> foo"]
},
{"description": "li end-tag followed by text",
"input": [["EndTag", "li"], ["Characters", "foo"]],
"expected": ["</li>foo"]
},
{"description": "li end-tag followed by start-tag",
"input": [["EndTag", "li"], ["StartTag", "foo", {}]],
"expected": ["</li><foo>"]
},
{"description": "li end-tag followed by li start-tag",
"input": [["EndTag", "li"], ["StartTag", "li", {}]],
"expected": ["<li>"]
},
{"description": "li end-tag followed by end-tag",
"input": [["EndTag", "li"], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "li end-tag at EOF",
"input": [["EndTag", "li"]],
"expected": [""]
},
{"description": "dt end-tag followed by comment",
"input": [["EndTag", "dt"], ["Comment", "foo"]],
"expected": ["</dt><!--foo-->"]
},
{"description": "dt end-tag followed by space character",
"input": [["EndTag", "dt"], ["Characters", " foo"]],
"expected": ["</dt> foo"]
},
{"description": "dt end-tag followed by text",
"input": [["EndTag", "dt"], ["Characters", "foo"]],
"expected": ["</dt>foo"]
},
{"description": "dt end-tag followed by start-tag",
"input": [["EndTag", "dt"], ["StartTag", "foo", {}]],
"expected": ["</dt><foo>"]
},
{"description": "dt end-tag followed by dt start-tag",
"input": [["EndTag", "dt"], ["StartTag", "dt", {}]],
"expected": ["<dt>"]
},
{"description": "dt end-tag followed by dd start-tag",
"input": [["EndTag", "dt"], ["StartTag", "dd", {}]],
"expected": ["<dd>"]
},
{"description": "dt end-tag followed by end-tag",
"input": [["EndTag", "dt"], ["EndTag", "foo", {}]],
"expected": ["</dt></foo>"]
},
{"description": "dt end-tag at EOF",
"input": [["EndTag", "dt"]],
"expected": ["</dt>"]
},
{"description": "dd end-tag followed by comment",
"input": [["EndTag", "dd"], ["Comment", "foo"]],
"expected": ["</dd><!--foo-->"]
},
{"description": "dd end-tag followed by space character",
"input": [["EndTag", "dd"], ["Characters", " foo"]],
"expected": ["</dd> foo"]
},
{"description": "dd end-tag followed by text",
"input": [["EndTag", "dd"], ["Characters", "foo"]],
"expected": ["</dd>foo"]
},
{"description": "dd end-tag followed by start-tag",
"input": [["EndTag", "dd"], ["StartTag", "foo", {}]],
"expected": ["</dd><foo>"]
},
{"description": "dd end-tag followed by dd start-tag",
"input": [["EndTag", "dd"], ["StartTag", "dd", {}]],
"expected": ["<dd>"]
},
{"description": "dd end-tag followed by dt start-tag",
"input": [["EndTag", "dd"], ["StartTag", "dt", {}]],
"expected": ["<dt>"]
},
{"description": "dd end-tag followed by end-tag",
"input": [["EndTag", "dd"], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "dd end-tag at EOF",
"input": [["EndTag", "dd"]],
"expected": [""]
},
{"description": "p end-tag followed by comment",
"input": [["EndTag", "p"], ["Comment", "foo"]],
"expected": ["</p><!--foo-->"]
},
{"description": "p end-tag followed by space character",
"input": [["EndTag", "p"], ["Characters", " foo"]],
"expected": ["</p> foo"]
},
{"description": "p end-tag followed by text",
"input": [["EndTag", "p"], ["Characters", "foo"]],
"expected": ["</p>foo"]
},
{"description": "p end-tag followed by start-tag",
"input": [["EndTag", "p"], ["StartTag", "foo", {}]],
"expected": ["</p><foo>"]
},
{"description": "p end-tag followed by address start-tag",
"input": [["EndTag", "p"], ["StartTag", "address", {}]],
"expected": ["<address>"]
},
{"description": "p end-tag followed by blockquote start-tag",
"input": [["EndTag", "p"], ["StartTag", "blockquote", {}]],
"expected": ["<blockquote>"]
},
{"description": "p end-tag followed by dl start-tag",
"input": [["EndTag", "p"], ["StartTag", "dl", {}]],
"expected": ["<dl>"]
},
{"description": "p end-tag followed by fieldset start-tag",
"input": [["EndTag", "p"], ["StartTag", "fieldset", {}]],
"expected": ["<fieldset>"]
},
{"description": "p end-tag followed by form start-tag",
"input": [["EndTag", "p"], ["StartTag", "form", {}]],
"expected": ["<form>"]
},
{"description": "p end-tag followed by h1 start-tag",
"input": [["EndTag", "p"], ["StartTag", "h1", {}]],
"expected": ["<h1>"]
},
{"description": "p end-tag followed by h2 start-tag",
"input": [["EndTag", "p"], ["StartTag", "h2", {}]],
"expected": ["<h2>"]
},
{"description": "p end-tag followed by h3 start-tag",
"input": [["EndTag", "p"], ["StartTag", "h3", {}]],
"expected": ["<h3>"]
},
{"description": "p end-tag followed by h4 start-tag",
"input": [["EndTag", "p"], ["StartTag", "h4", {}]],
"expected": ["<h4>"]
},
{"description": "p end-tag followed by h5 start-tag",
"input": [["EndTag", "p"], ["StartTag", "h5", {}]],
"expected": ["<h5>"]
},
{"description": "p end-tag followed by h6 start-tag",
"input": [["EndTag", "p"], ["StartTag", "h6", {}]],
"expected": ["<h6>"]
},
{"description": "p end-tag followed by hr start-tag",
"input": [["EndTag", "p"], ["StartTag", "hr", {}]],
"expected": ["<hr>"]
},
{"description": "p end-tag followed by menu start-tag",
"input": [["EndTag", "p"], ["StartTag", "menu", {}]],
"expected": ["<menu>"]
},
{"description": "p end-tag followed by ol start-tag",
"input": [["EndTag", "p"], ["StartTag", "ol", {}]],
"expected": ["<ol>"]
},
{"description": "p end-tag followed by p start-tag",
"input": [["EndTag", "p"], ["StartTag", "p", {}]],
"expected": ["<p>"]
},
{"description": "p end-tag followed by pre start-tag",
"input": [["EndTag", "p"], ["StartTag", "pre", {}]],
"expected": ["<pre>"]
},
{"description": "p end-tag followed by table start-tag",
"input": [["EndTag", "p"], ["StartTag", "table", {}]],
"expected": ["<table>"]
},
{"description": "p end-tag followed by ul start-tag",
"input": [["EndTag", "p"], ["StartTag", "ul", {}]],
"expected": ["<ul>"]
},
{"description": "p end-tag followed by end-tag",
"input": [["EndTag", "p"], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "p end-tag at EOF",
"input": [["EndTag", "p"]],
"expected": [""]
},
{"description": "optgroup end-tag followed by comment",
"input": [["EndTag", "optgroup"], ["Comment", "foo"]],
"expected": ["</optgroup><!--foo-->"]
},
{"description": "optgroup end-tag followed by space character",
"input": [["EndTag", "optgroup"], ["Characters", " foo"]],
"expected": ["</optgroup> foo"]
},
{"description": "optgroup end-tag followed by text",
"input": [["EndTag", "optgroup"], ["Characters", "foo"]],
"expected": ["</optgroup>foo"]
},
{"description": "optgroup end-tag followed by start-tag",
"input": [["EndTag", "optgroup"], ["StartTag", "foo", {}]],
"expected": ["</optgroup><foo>"]
},
{"description": "optgroup end-tag followed by optgroup start-tag",
"input": [["EndTag", "optgroup"], ["StartTag", "optgroup", {}]],
"expected": ["<optgroup>"]
},
{"description": "optgroup end-tag followed by end-tag",
"input": [["EndTag", "optgroup"], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "optgroup end-tag at EOF",
"input": [["EndTag", "optgroup"]],
"expected": [""]
},
{"description": "option end-tag followed by comment",
"input": [["EndTag", "option"], ["Comment", "foo"]],
"expected": ["</option><!--foo-->"]
},
{"description": "option end-tag followed by space character",
"input": [["EndTag", "option"], ["Characters", " foo"]],
"expected": ["</option> foo"]
},
{"description": "option end-tag followed by text",
"input": [["EndTag", "option"], ["Characters", "foo"]],
"expected": ["</option>foo"]
},
{"description": "option end-tag followed by start-tag",
"input": [["EndTag", "option"], ["StartTag", "foo", {}]],
"expected": ["</option><foo>"]
},
{"description": "option end-tag followed by option start-tag",
"input": [["EndTag", "option"], ["StartTag", "option", {}]],
"expected": ["<option>"]
},
{"description": "option end-tag followed by end-tag",
"input": [["EndTag", "option"], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "option end-tag at EOF",
"input": [["EndTag", "option"]],
"expected": [""]
},
{"description": "colgroup start-tag followed by comment",
"input": [["StartTag", "colgroup", {}], ["Comment", "foo"]],
"expected": ["<colgroup><!--foo-->"]
},
{"description": "colgroup start-tag followed by space character",
"input": [["StartTag", "colgroup", {}], ["Characters", " foo"]],
"expected": ["<colgroup> foo"]
},
{"description": "colgroup start-tag followed by text",
"input": [["StartTag", "colgroup", {}], ["Characters", "foo"]],
"expected": ["<colgroup>foo"]
},
{"description": "colgroup start-tag followed by start-tag",
"input": [["StartTag", "colgroup", {}], ["StartTag", "foo", {}]],
"expected": ["<colgroup><foo>"]
},
{"description": "first colgroup in a table with a col child",
"input": [["StartTag", "table", {}], ["StartTag", "colgroup", {}], ["StartTag", "col", {}]],
"expected": ["<table><col>"]
},
{"description": "colgroup with a col child, following another colgroup",
"input": [["EndTag", "colgroup", {}], ["StartTag", "colgroup", {}], ["StartTag", "col", {}]],
"expected": ["</colgroup><col>", "<colgroup><col>"]
},
{"description": "colgroup start-tag followed by end-tag",
"input": [["StartTag", "colgroup", {}], ["EndTag", "foo", {}]],
"expected": ["<colgroup></foo>"]
},
{"description": "colgroup start-tag at EOF",
"input": [["StartTag", "colgroup", {}]],
"expected": ["<colgroup>"]
},
{"description": "colgroup end-tag followed by comment",
"input": [["EndTag", "colgroup"], ["Comment", "foo"]],
"expected": ["</colgroup><!--foo-->"]
},
{"description": "colgroup end-tag followed by space character",
"input": [["EndTag", "colgroup"], ["Characters", " foo"]],
"expected": ["</colgroup> foo"]
},
{"description": "colgroup end-tag followed by text",
"input": [["EndTag", "colgroup"], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "colgroup end-tag followed by start-tag",
"input": [["EndTag", "colgroup"], ["StartTag", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "colgroup end-tag followed by end-tag",
"input": [["EndTag", "colgroup"], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "colgroup end-tag at EOF",
"input": [["EndTag", "colgroup"]],
"expected": [""]
},
{"description": "thead end-tag followed by comment",
"input": [["EndTag", "thead"], ["Comment", "foo"]],
"expected": ["</thead><!--foo-->"]
},
{"description": "thead end-tag followed by space character",
"input": [["EndTag", "thead"], ["Characters", " foo"]],
"expected": ["</thead> foo"]
},
{"description": "thead end-tag followed by text",
"input": [["EndTag", "thead"], ["Characters", "foo"]],
"expected": ["</thead>foo"]
},
{"description": "thead end-tag followed by start-tag",
"input": [["EndTag", "thead"], ["StartTag", "foo", {}]],
"expected": ["</thead><foo>"]
},
{"description": "thead end-tag followed by tbody start-tag",
"input": [["EndTag", "thead"], ["StartTag", "tbody", {}]],
"expected": ["<tbody>"]
},
{"description": "thead end-tag followed by tfoot start-tag",
"input": [["EndTag", "thead"], ["StartTag", "tfoot", {}]],
"expected": ["<tfoot>"]
},
{"description": "thead end-tag followed by end-tag",
"input": [["EndTag", "thead"], ["EndTag", "foo", {}]],
"expected": ["</thead></foo>"]
},
{"description": "thead end-tag at EOF",
"input": [["EndTag", "thead"]],
"expected": ["</thead>"]
},
{"description": "tbody start-tag followed by comment",
"input": [["StartTag", "tbody", {}], ["Comment", "foo"]],
"expected": ["<tbody><!--foo-->"]
},
{"description": "tbody start-tag followed by space character",
"input": [["StartTag", "tbody", {}], ["Characters", " foo"]],
"expected": ["<tbody> foo"]
},
{"description": "tbody start-tag followed by text",
"input": [["StartTag", "tbody", {}], ["Characters", "foo"]],
"expected": ["<tbody>foo"]
},
{"description": "tbody start-tag followed by start-tag",
"input": [["StartTag", "tbody", {}], ["StartTag", "foo", {}]],
"expected": ["<tbody><foo>"]
},
{"description": "first tbody in a table with a tr child",
"input": [["StartTag", "table", {}], ["StartTag", "tbody", {}], ["StartTag", "tr", {}]],
"expected": ["<table><tr>"]
},
{"description": "tbody with a tr child, following another tbody",
"input": [["EndTag", "tbody", {}], ["StartTag", "tbody", {}], ["StartTag", "tr", {}]],
"expected": ["<tbody><tr>", "</tbody><tr>"]
},
{"description": "tbody with a tr child, following a thead",
"input": [["EndTag", "thead", {}], ["StartTag", "tbody", {}], ["StartTag", "tr", {}]],
"expected": ["<tbody><tr>", "</thead><tr>"]
},
{"description": "tbody with a tr child, following a tfoot",
"input": [["EndTag", "tfoot", {}], ["StartTag", "tbody", {}], ["StartTag", "tr", {}]],
"expected": ["<tbody><tr>", "</tfoot><tr>"]
},
{"description": "tbody start-tag followed by end-tag",
"input": [["StartTag", "tbody", {}], ["EndTag", "foo", {}]],
"expected": ["<tbody></foo>"]
},
{"description": "tbody start-tag at EOF",
"input": [["StartTag", "tbody", {}]],
"expected": ["<tbody>"]
},
{"description": "tbody end-tag followed by comment",
"input": [["EndTag", "tbody"], ["Comment", "foo"]],
"expected": ["</tbody><!--foo-->"]
},
{"description": "tbody end-tag followed by space character",
"input": [["EndTag", "tbody"], ["Characters", " foo"]],
"expected": ["</tbody> foo"]
},
{"description": "tbody end-tag followed by text",
"input": [["EndTag", "tbody"], ["Characters", "foo"]],
"expected": ["</tbody>foo"]
},
{"description": "tbody end-tag followed by start-tag",
"input": [["EndTag", "tbody"], ["StartTag", "foo", {}]],
"expected": ["</tbody><foo>"]
},
{"description": "tbody end-tag followed by tbody start-tag",
"input": [["EndTag", "tbody"], ["StartTag", "tbody", {}]],
"expected": ["<tbody>", "</tbody>"]
},
{"description": "tbody end-tag followed by tfoot start-tag",
"input": [["EndTag", "tbody"], ["StartTag", "tfoot", {}]],
"expected": ["<tfoot>"]
},
{"description": "tbody end-tag followed by end-tag",
"input": [["EndTag", "tbody"], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "tbody end-tag at EOF",
"input": [["EndTag", "tbody"]],
"expected": [""]
},
{"description": "tfoot end-tag followed by comment",
"input": [["EndTag", "tfoot"], ["Comment", "foo"]],
"expected": ["</tfoot><!--foo-->"]
},
{"description": "tfoot end-tag followed by space character",
"input": [["EndTag", "tfoot"], ["Characters", " foo"]],
"expected": ["</tfoot> foo"]
},
{"description": "tfoot end-tag followed by text",
"input": [["EndTag", "tfoot"], ["Characters", "foo"]],
"expected": ["</tfoot>foo"]
},
{"description": "tfoot end-tag followed by start-tag",
"input": [["EndTag", "tfoot"], ["StartTag", "foo", {}]],
"expected": ["</tfoot><foo>"]
},
{"description": "tfoot end-tag followed by tbody start-tag",
"input": [["EndTag", "tfoot"], ["StartTag", "tbody", {}]],
"expected": ["<tbody>", "</tfoot>"]
},
{"description": "tfoot end-tag followed by end-tag",
"input": [["EndTag", "tfoot"], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "tfoot end-tag at EOF",
"input": [["EndTag", "tfoot"]],
"expected": [""]
},
{"description": "tr end-tag followed by comment",
"input": [["EndTag", "tr"], ["Comment", "foo"]],
"expected": ["</tr><!--foo-->"]
},
{"description": "tr end-tag followed by space character",
"input": [["EndTag", "tr"], ["Characters", " foo"]],
"expected": ["</tr> foo"]
},
{"description": "tr end-tag followed by text",
"input": [["EndTag", "tr"], ["Characters", "foo"]],
"expected": ["</tr>foo"]
},
{"description": "tr end-tag followed by start-tag",
"input": [["EndTag", "tr"], ["StartTag", "foo", {}]],
"expected": ["</tr><foo>"]
},
{"description": "tr end-tag followed by tr start-tag",
"input": [["EndTag", "tr"], ["StartTag", "tr", {}]],
"expected": ["<tr>", "</tr>"]
},
{"description": "tr end-tag followed by end-tag",
"input": [["EndTag", "tr"], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "tr end-tag at EOF",
"input": [["EndTag", "tr"]],
"expected": [""]
},
{"description": "td end-tag followed by comment",
"input": [["EndTag", "td"], ["Comment", "foo"]],
"expected": ["</td><!--foo-->"]
},
{"description": "td end-tag followed by space character",
"input": [["EndTag", "td"], ["Characters", " foo"]],
"expected": ["</td> foo"]
},
{"description": "td end-tag followed by text",
"input": [["EndTag", "td"], ["Characters", "foo"]],
"expected": ["</td>foo"]
},
{"description": "td end-tag followed by start-tag",
"input": [["EndTag", "td"], ["StartTag", "foo", {}]],
"expected": ["</td><foo>"]
},
{"description": "td end-tag followed by td start-tag",
"input": [["EndTag", "td"], ["StartTag", "td", {}]],
"expected": ["<td>", "</td>"]
},
{"description": "td end-tag followed by th start-tag",
"input": [["EndTag", "td"], ["StartTag", "th", {}]],
"expected": ["<th>", "</td>"]
},
{"description": "td end-tag followed by end-tag",
"input": [["EndTag", "td"], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "td end-tag at EOF",
"input": [["EndTag", "td"]],
"expected": [""]
},
{"description": "th end-tag followed by comment",
"input": [["EndTag", "th"], ["Comment", "foo"]],
"expected": ["</th><!--foo-->"]
},
{"description": "th end-tag followed by space character",
"input": [["EndTag", "th"], ["Characters", " foo"]],
"expected": ["</th> foo"]
},
{"description": "th end-tag followed by text",
"input": [["EndTag", "th"], ["Characters", "foo"]],
"expected": ["</th>foo"]
},
{"description": "th end-tag followed by start-tag",
"input": [["EndTag", "th"], ["StartTag", "foo", {}]],
"expected": ["</th><foo>"]
},
{"description": "th end-tag followed by th start-tag",
"input": [["EndTag", "th"], ["StartTag", "th", {}]],
"expected": ["<th>", "</th>"]
},
{"description": "th end-tag followed by td start-tag",
"input": [["EndTag", "th"], ["StartTag", "td", {}]],
"expected": ["<td>", "</th>"]
},
{"description": "th end-tag followed by end-tag",
"input": [["EndTag", "th"], ["EndTag", "foo", {}]],
"expected": ["</foo>"]
},
{"description": "th end-tag at EOF",
"input": [["EndTag", "th"]],
"expected": [""]
}
]}

View file

@ -0,0 +1,54 @@
{"tests":[
{"description": "quote_char=\"'\"",
"options": {"quote_char": "'"},
"input": [["StartTag", "span", {"title": "test 'with' quote_char"}]],
"expected": ["<span title='test &#39;with&#39; quote_char'>"]
},
{"description": "quote_attr_values=true",
"options": {"quote_attr_values": true},
"input": [["StartTag", "button", {"disabled": "disabled"}]],
"expected": ["<button disabled>"],
"xhtml": ["<button disabled=\"disabled\">"]
},
{"description": "quote_attr_values=true with irrelevant",
"options": {"quote_attr_values": true},
"input": [["StartTag", "div", {"irrelevant": "irrelevant"}]],
"expected": ["<div irrelevant>"],
"xhtml": ["<div irrelevant=\"irrelevant\">"]
},
{"description": "use_trailing_solidus=true with void element",
"options": {"use_trailing_solidus": true},
"input": [["EmptyTag", "img", {}]],
"expected": ["<img />"]
},
{"description": "use_trailing_solidus=true with non-void element",
"options": {"use_trailing_solidus": true},
"input": [["StartTag", "div", {}]],
"expected": ["<div>"]
},
{"description": "minimize_boolean_attributes=false",
"options": {"minimize_boolean_attributes": false},
"input": [["StartTag", "div", {"irrelevant": "irrelevant"}]],
"expected": ["<div irrelevant=irrelevant>"],
"xhtml": ["<div irrelevant=\"irrelevant\">"]
},
{"description": "minimize_boolean_attributes=false with empty value",
"options": {"minimize_boolean_attributes": false},
"input": [["StartTag", "div", {"irrelevant": ""}]],
"expected": ["<div irrelevant=\"\">"]
},
{"description": "escape less than signs in attribute values",
"options": {"escape_lt_in_attrs": true},
"input": [["StartTag", "a", {"title": "a<b>c&d"}]],
"expected": ["<a title=\"a&lt;b>c&amp;d\">"]
}
]}
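Each fixture above pairs an input token stream with serializer options and the acceptable output strings (plus an optional xhtml variant). As a rough, illustrative sketch of how one such entry is exercised — mirroring the HTMLSerializer.serialize call in test_serializer.rb later in this commit; the inlined fixture hash is copied from above, and the JsonWalker adapter is the one defined in that test file:

require 'html5lib/serializer'
require 'html5lib/treewalkers'   # JsonWalker (see test_serializer.rb below) subclasses TreeWalkers::Base

# Inlined copy of the quote_char fixture above.
test = {
  "options"  => {"quote_char" => "'"},
  "input"    => [["StartTag", "span", {"title" => "test 'with' quote_char"}]],
  "expected" => ["<span title='test &#39;with&#39; quote_char'>"]
}

result = HTML5lib::HTMLSerializer.serialize(JsonWalker.new(test["input"]),
                                            test["options"] || {})
# The case passes when result matches one of the strings in test["expected"].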

View file

@ -0,0 +1,51 @@
{"tests": [
{"description": "bare text with leading spaces",
"options": {"strip_whitespace": true},
"input": [["Characters", "\t\r\n\u000B\u000C foo"]],
"expected": ["foo"]
},
{"description": "bare text with trailing spaces",
"options": {"strip_whitespace": true},
"input": [["Characters", "foo \t\r\n\u000B\u000C"]],
"expected": ["foo"]
},
{"description": "bare text with inner spaces",
"options": {"strip_whitespace": true},
"input": [["Characters", "foo \t\r\n\u000B\u000C bar"]],
"expected": ["foo bar"]
},
{"description": "text within <pre>",
"options": {"strip_whitespace": true},
"input": [["StartTag", "pre", {}], ["Characters", "\t\r\n\u000B\u000C foo \t\r\n\u000B\u000C bar \t\r\n\u000B\u000C"], ["EndTag", "pre"]],
"expected": ["<pre>\t\r\n\u000B\u000C foo \t\r\n\u000B\u000C bar \t\r\n\u000B\u000C</pre>"]
},
{"description": "text within <pre>, with inner markup",
"options": {"strip_whitespace": true},
"input": [["StartTag", "pre", {}], ["Characters", "\t\r\n\u000B\u000C fo"], ["StartTag", "span", {}], ["Characters", "o \t\r\n\u000B\u000C b"], ["EndTag", "span"], ["Characters", "ar \t\r\n\u000B\u000C"], ["EndTag", "pre"]],
"expected": ["<pre>\t\r\n\u000B\u000C fo<span>o \t\r\n\u000B\u000C b</span>ar \t\r\n\u000B\u000C</pre>"]
},
{"description": "text within <textarea>",
"options": {"strip_whitespace": true},
"input": [["StartTag", "textarea", {}], ["Characters", "\t\r\n\u000B\u000C foo \t\r\n\u000B\u000C bar \t\r\n\u000B\u000C"], ["EndTag", "textarea"]],
"expected": ["<textarea>\t\r\n\u000B\u000C foo \t\r\n\u000B\u000C bar \t\r\n\u000B\u000C</textarea>"]
},
{"description": "text within <script>",
"options": {"strip_whitespace": true},
"input": [["StartTag", "script", {}], ["Characters", "\t\r\n\u000B\u000C foo \t\r\n\u000B\u000C bar \t\r\n\u000B\u000C"], ["EndTag", "script"]],
"expected": ["<script>\t\r\n\u000B\u000C foo \t\r\n\u000B\u000C bar \t\r\n\u000B\u000C</script>"]
},
{"description": "text within <style>",
"options": {"strip_whitespace": true},
"input": [["StartTag", "style", {}], ["Characters", "\t\r\n\u000B\u000C foo \t\r\n\u000B\u000C bar \t\r\n\u000B\u000C"], ["EndTag", "style"]],
"expected": ["<style>\t\r\n\u000B\u000C foo \t\r\n\u000B\u000C bar \t\r\n\u000B\u000C</style>"]
}
]}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View file

@ -0,0 +1,36 @@
{"tests": [
{"description":"PLAINTEXT content model flag",
"contentModelFlags":["PLAINTEXT"],
"input":"<head>&body;",
"output":[["Character", "<head>&body;"]]},
{"description":"End tag closing RCDATA or CDATA",
"contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"bar",
"input":"foo</bar>",
"output":[["Character", "foo"], ["EndTag", "bar"]]},
{"description":"End tag with incorrect name in RCDATA or CDATA",
"contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"baz",
"input":"</foo>bar</baz>",
"output":[["Character", "</foo>bar"], ["EndTag", "baz"]]},
{"description":"End tag closing RCDATA or CDATA, switching back to PCDATA",
"contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"bar",
"input":"foo</bar></baz>",
"output":[["Character", "foo"], ["EndTag", "bar"], ["EndTag", "baz"]]},
{"description":"CDATA w/ something looking like an entity",
"contentModelFlags":["CDATA"],
"input":"&foo;",
"output":[["Character", "&foo;"]]},
{"description":"RCDATA w/ an entity",
"contentModelFlags":["RCDATA"],
"input":"&lt;",
"output":[["Character", "<"]]}
]}

View file

@ -0,0 +1,21 @@
{"tests": [
{"description":"Commented close tag in [R]CDATA",
"contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"bar",
"input":"foo<!--</bar>--></bar>",
"output":[["Character", "foo<!--</bar>-->"], ["EndTag", "bar"]]},
{"description":"Bogus comment in [R]CDATA",
"contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"bar",
"input":"foo<!-->baz</bar>",
"output":[["Character", "foo<!-->baz"], ["EndTag", "bar"]]},
{"description":"End tag surrounded by bogus comment in [R]CDATA",
"contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"bar",
"input":"foo<!--></bar><!-->baz</bar>",
"output":[["Character", "foo<!-->"], ["EndTag", "bar"], "ParseError", ["Comment", ""], ["Character", "baz"], ["EndTag", "bar"]]}
]}

View file

@ -0,0 +1,156 @@
{"tests": [
{"description":"Correct Doctype lowercase",
"input":"<!DOCTYPE html>",
"output":[["DOCTYPE", "html", null, null, true]]},
{"description":"Correct Doctype uppercase",
"input":"<!DOCTYPE HTML>",
"output":[["DOCTYPE", "HTML", null, null, true]]},
{"description":"Correct Doctype mixed case",
"input":"<!DOCTYPE HtMl>",
"output":[["DOCTYPE", "HtMl", null, null, true]]},
{"description":"Truncated doctype start",
"input":"<!DOC>",
"output":["ParseError", ["Comment", "DOC"]]},
{"description":"Doctype in error",
"input":"<!DOCTYPE foo>",
"output":[["DOCTYPE", "foo", null, null, true]]},
{"description":"Single Start Tag",
"input":"<h>",
"output":[["StartTag", "h", {}]]},
{"description":"Empty end tag",
"input":"</>",
"output":["ParseError"]},
{"description":"Empty start tag",
"input":"<>",
"output":["ParseError", ["Character", "<>"]]},
{"description":"Start Tag w/attribute",
"input":"<h a='b'>",
"output":[["StartTag", "h", {"a":"b"}]]},
{"description":"Start Tag w/attribute no quotes",
"input":"<h a=b>",
"output":[["StartTag", "h", {"a":"b"}]]},
{"description":"Start/End Tag",
"input":"<h></h>",
"output":[["StartTag", "h", {}], ["EndTag", "h"]]},
{"description":"Two unclosed start tags",
"input":"<p>One<p>Two",
"output":[["StartTag", "p", {}], ["Character", "One"], ["StartTag", "p", {}], ["Character", "Two"]]},
{"description":"End Tag w/attribute",
"input":"<h></h a='b'>",
"output":[["StartTag", "h", {}], "ParseError", ["EndTag", "h"]]},
{"description":"Multiple atts",
"input":"<h a='b' c='d'>",
"output":[["StartTag", "h", {"a":"b", "c":"d"}]]},
{"description":"Multiple atts no space",
"input":"<h a='b'c='d'>",
"output":[["StartTag", "h", {"a":"b", "c":"d"}]]},
{"description":"Repeated attr",
"input":"<h a='b' a='d'>",
"output":["ParseError", ["StartTag", "h", {"a":"b"}]]},
{"description":"Simple comment",
"input":"<!--comment-->",
"output":[["Comment", "comment"]]},
{"description":"Comment, Central dash no space",
"input":"<!----->",
"output":["ParseError", ["Comment", "-"]]},
{"description":"Comment, two central dashes",
"input":"<!-- --comment -->",
"output":["ParseError", ["Comment", " --comment "]]},
{"description":"Unfinished comment",
"input":"<!--comment",
"output":["ParseError", ["Comment", "comment"]]},
{"description":"Start of a comment",
"input":"<!-",
"output":["ParseError", ["Comment", "-"]]},
{"description":"Short comment",
"input":"<!-->",
"output":["ParseError", ["Comment", ""]]},
{"description":"Short comment two",
"input":"<!--->",
"output":["ParseError", ["Comment", ""]]},
{"description":"Short comment three",
"input":"<!---->",
"output":[["Comment", ""]]},
{"description":"Ampersand EOF",
"input":"&",
"output":[["Character", "&"]]},
{"description":"Ampersand ampersand EOF",
"input":"&&",
"output":[["Character", "&&"]]},
{"description":"Ampersand space EOF",
"input":"& ",
"output":[["Character", "& "]]},
{"description":"Unfinished entity",
"input":"&f",
"output":["ParseError", ["Character", "&f"]]},
{"description":"Ampersand, number sign",
"input":"&#",
"output":["ParseError", ["Character", "&#"]]},
{"description":"Unfinished numeric entity",
"input":"&#x",
"output":["ParseError", ["Character", "&#x"]]},
{"description":"Entity with trailing semicolon (1)",
"input":"I'm &not;it",
"output":[["Character","I'm ¬it"]]},
{"description":"Entity with trailing semicolon (2)",
"input":"I'm &notin;",
"output":[["Character","I'm ∉"]]},
{"description":"Entity without trailing semicolon (1)",
"input":"I'm &notit",
"output":[["Character","I'm "], "ParseError", ["Character", "¬it"]]},
{"description":"Entity without trailing semicolon (2)",
"input":"I'm &notin",
"output":[["Character","I'm "], "ParseError", ["Character", "∉"]]},
{"description":"Partial entity match at end of file",
"input":"I'm &no",
"output":[["Character","I'm "], "ParseError", ["Character", "&no"]]},
{"description":"ASCII decimal entity",
"input":"&#0036;",
"output":[["Character","$"]]},
{"description":"ASCII hexadecimal entity",
"input":"&#x3f;",
"output":[["Character","?"]]},
{"description":"Hexadecimal entity in attribute",
"input":"<h a='&#x3f;'></h>",
"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]}
]}
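Each tokenizer case above lists the expected token stream, with bare "ParseError" strings interleaved wherever an error must be reported. A minimal, illustrative sketch of replaying one case against the tokenizer, following the `for token in @tokenizer` loop used by TokenizerTestParser further down; the require path and the inlined fixture are assumptions:

require 'html5lib/tokenizer'   # assumed path, matching the other html5lib requires in this commit

test = {
  "description" => "Start Tag w/attribute",
  "input"       => "<h a='b'>",
  "output"      => [["StartTag", "h", {"a" => "b"}]]
}

tokens = []
HTML5lib::HTMLTokenizer.new(test["input"]).each do |token|
  tokens << token   # each token is a hash with :type plus :name/:data as applicable
end
# TokenizerTestParser (below) converts these hashes back into
# ["StartTag", name, attrs]-style arrays before comparing against "output".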

View file

@ -0,0 +1,125 @@
{"tests": [
{"description":"DOCTYPE without name",
"input":"<!DOCTYPE>",
"output":["ParseError", "ParseError", ["DOCTYPE", "", null, null, false]]},
{"description":"DOCTYPE without space before name",
"input":"<!DOCTYPEhtml>",
"output":["ParseError", ["DOCTYPE", "html", null, null, true]]},
{"description":"Incorrect DOCTYPE without a space before name",
"input":"<!DOCTYPEfoo>",
"output":["ParseError", ["DOCTYPE", "foo", null, null, true]]},
{"description":"DOCTYPE with publicId",
"input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\">",
"output":[["DOCTYPE", "html", "-//W3C//DTD HTML Transitional 4.01//EN", null, true]]},
{"description":"DOCTYPE with EOF after PUBLIC",
"input":"<!DOCTYPE html PUBLIC",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"DOCTYPE with EOF after PUBLIC '",
"input":"<!DOCTYPE html PUBLIC '",
"output":["ParseError", ["DOCTYPE", "html", "", null, false]]},
{"description":"DOCTYPE with EOF after PUBLIC 'x",
"input":"<!DOCTYPE html PUBLIC 'x",
"output":["ParseError", ["DOCTYPE", "html", "x", null, false]]},
{"description":"DOCTYPE with systemId",
"input":"<!DOCTYPE html SYSTEM \"-//W3C//DTD HTML Transitional 4.01//EN\">",
"output":[["DOCTYPE", "html", null, "-//W3C//DTD HTML Transitional 4.01//EN", true]]},
{"description":"DOCTYPE with publicId and systemId",
"input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\" \"-//W3C//DTD HTML Transitional 4.01//EN\">",
"output":[["DOCTYPE", "html", "-//W3C//DTD HTML Transitional 4.01//EN", "-//W3C//DTD HTML Transitional 4.01//EN", true]]},
{"description":"Incomplete doctype",
"input":"<!DOCTYPE html ",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"Numeric entity representing the NUL character",
"input":"&#0000;",
"output":[["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity representing the NUL character",
"input":"&#x0000;",
"output":[["Character", "\uFFFD"]]},
{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#2225222;",
"output":[["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#x1010FFFF;",
"output":[["Character", "\uFFFD"]]},
{"description":"Numeric entity representing a Windows-1252 'codepoint'",
"input":"&#137;",
"output":["ParseError", ["Character", "\u2030"]]},
{"description":"Hexadecimal entity representing a Windows-1252 'codepoint'",
"input":"&#x89;",
"output":["ParseError", ["Character", "\u2030"]]},
{"description":"Hexadecimal entity with mixed uppercase and lowercase",
"input":"&#xaBcD;",
"output":[["Character", "\uABCD"]]},
{"description":"Entity without a name",
"input":"&;",
"output":["ParseError", ["Character", "&;"]]},
{"description":"Unescaped ampersand in attribute value",
"input":"<h a='&'>",
"output":["ParseError", ["StartTag", "h", { "a":"&" }]]},
{"description":"StartTag containing <",
"input":"<a<b>",
"output":[["StartTag", "a<b", { }]]},
{"description":"Non-void element containing trailing /",
"input":"<h/>",
"output":["ParseError", ["StartTag", "h", { }]]},
{"description":"Void element with permitted slash",
"input":"<br/>",
"output":[["StartTag", "br", { }]]},
{"description":"StartTag containing /",
"input":"<h/a='b'>",
"output":["ParseError", ["StartTag", "h", { "a":"b" }]]},
{"description":"Double-quoted attribute value",
"input":"<h a=\"b\">",
"output":[["StartTag", "h", { "a":"b" }]]},
{"description":"Unescaped </",
"input":"</",
"output":["ParseError", ["Character", "</"]]},
{"description":"Illegal end tag name",
"input":"</1>",
"output":["ParseError", ["Comment", "1"]]},
{"description":"Simili processing instruction",
"input":"<?namespace>",
"output":["ParseError", ["Comment", "?namespace"]]},
{"description":"A bogus comment stops at >, even if preceeded by two dashes",
"input":"<?foo-->",
"output":["ParseError", ["Comment", "?foo--"]]},
{"description":"Unescaped <",
"input":"foo < bar",
"output":[["Character", "foo "], "ParseError", ["Character", "< bar"]]},
{"description":"Null Byte Replacement",
"input":"\u0000",
"output":[["Character", "\ufffd"]]}
]}

File diff suppressed because it is too large

View file

@ -0,0 +1,779 @@
#data
<!DOCTYPE HTML>Test
#errors
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| "Test"
#data
<textarea>test</div>test
#errors
10: missing document type declaration.
17: unescaped '</' in CDATA or RCDATA block.
25: unexpected end of file while parsing CDATA section for element textarea.
#document
| <html>
| <head>
| <body>
| <textarea>
| "test</div>test"
#data
<table><td>
#errors
7: missing document type declaration.
11: required tr element start tag implied by unexpected td element start tag.
12: unexpected end of file implied table element end tag.
#document
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>
#data
<table><td>test</tbody></table>
#errors
missing document type declaration
Unexpected end of file
#document
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>
| "test"
#data
<frame>test
#errors
missing document type declaration
frame element can't occur here
#document
| <html>
| <head>
| <body>
| "test"
#data
<!DOCTYPE HTML><frameset>test
#errors
frameset can't contain text
Unexpected end of file
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <frameset>
#data
<!DOCTYPE HTML><frameset><!DOCTYPE HTML>
#errors
document type declaration can only occur at the start of a document
Expected end tag </frameset>
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <frameset>
#data
<!DOCTYPE HTML><font><p><b>test</font>
#errors
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| <font>
| <p>
| <font>
| <b>
| "test"
#data
<!DOCTYPE HTML><dt><div><dd>
#errors
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| <dt>
| <div>
| <dd>
#data
<script></x
#errors
no document type
</ in script
Unexpected end of file. Expected </script> end tag.
#document
| <html>
| <head>
| <script>
| "</x"
| <body>
#data
<table><plaintext><td>
#errors
no document type
<plaintext> directly inside table
Characters inside table.
Unexpected end of file.
#document
| <html>
| <head>
| <body>
| <plaintext>
| "<td>"
| <table>
#data
<plaintext></plaintext>
#errors
No DOCTYPE seen.
Unexpected end of file.
#document
| <html>
| <head>
| <body>
| <plaintext>
| "</plaintext>"
#data
<!DOCTYPE HTML><table><tr>TEST
#errors
TEST can't occur in <tr>
Unexpected end of file.
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| "TEST"
| <table>
| <tbody>
| <tr>
#data
<!DOCTYPE HTML><body t1=1><body t2=2><body t3=3 t4=4>
#errors
Unexpected start tag "body"
Unexpected start tag "body"
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| t4="4"
| t2="2"
| t3="3"
| t1="1"
#data
</b test
#errors
Unexpected EOF in attribute
Unexpected attribute in end tag.
No doctype.
Unexpected end tag.
#document
| <html>
| <head>
| <body>
#data
<!DOCTYPE HTML></b test<b &=&amp>X
#errors
Unexpected < in attribute
End tag contains attributes.
Unexpected end tag.
Named entity didn't end with ;
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| "X"
#data
<!doctypehtml><scrIPt type=text/x-foobar;baz>X</SCRipt
#errors
No space after literal DOCTYPE.
Unexpected EOF in (end) tag name
#document
| <!DOCTYPE html>
| <html>
| <head>
| <script>
| type="text/x-foobar;baz"
| "X"
| <body>
#data
&
#errors
No doctype.
Unfinished entity.
#document
| <html>
| <head>
| <body>
| "&"
#data
&#
#errors
No doctype.
Unfinished numeric entity.
#document
| <html>
| <head>
| <body>
| "&#"
#data
&#X
#errors
No doctype.
Unfinished hexadecimal entity.
#document
| <html>
| <head>
| <body>
| "&#X"
#data
&#x
#errors
No doctype.
Unfinished hexadecimal entity.
#document
| <html>
| <head>
| <body>
| "&#x"
#data
&#45
#errors
No doctype.
Numeric entity didn't end with ;
#document
| <html>
| <head>
| <body>
| "-"
#data
&x-test
#errors
No doctype.
Unfinished named entity.
#document
| <html>
| <head>
| <body>
| "&x-test"
#data
<!doctypehtml><p><li>
#errors
No space after literal DOCTYPE.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| <li>
#data
<!doctypeHTML><p><dt>
#errors
No space after literal DOCTYPE.
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| <p>
| <dt>
#data
<!doctypehtmL><p><dd>
#errors
No space after literal DOCTYPE.
#document
| <!DOCTYPE htmL>
| <html>
| <head>
| <body>
| <p>
| <dd>
#data
<!doctypehtml><p><form>
#errors
No space after literal DOCTYPE.
Unexpected EOF.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <p>
| <form>
#data
<!DOCTYPE HTML><p><b><i><u></p> <p>X
#errors
Unexpected end tag </p>.
Unexpected EOF. Missing closing tags.
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| <p>
| <b>
| <i>
| <u>
| " "
| <p>
| <b>
| <i>
| <u>
| "X"
#data
<!DOCTYPE HTML><p></P>X
#errors
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| <p>
| "X"
#data
&AMP
#errors
No doctype.
No closing ; for the entity.
#document
| <html>
| <head>
| <body>
| "&"
#data
&AMp;
#errors
No doctype.
Invalid entity.
#document
| <html>
| <head>
| <body>
| "&AMp;"
#data
<!DOCTYPE HTML><html><head></head><body><thisISasillyTESTelementNameToMakeSureCrazyTagNamesArePARSEDcorrectLY>
#errors
Unexpected end of file.
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| <thisisasillytestelementnametomakesurecrazytagnamesareparsedcorrectly>
#data
<!DOCTYPE HTML>X</body>X
#errors
Unexpected non-space characters in the after body phase.
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| "XX"
#data
<!DOCTYPE HTML><!-- X
#errors
Unexpected end of file in comment.
#document
| <!DOCTYPE HTML>
| <!-- X -->
| <html>
| <head>
| <body>
#data
<!DOCTYPE HTML><table><caption>test TEST</caption><td>test
#errors
Unexpected <td> in table body phase.
Unexpected end of file.
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| <table>
| <caption>
| "test TEST"
| <tbody>
| <tr>
| <td>
| "test"
#data
<!DOCTYPE HTML><select><option><optgroup>
#errors
Unexpected end of file. Missing closing tags.
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| <select>
| <option>
| <optgroup>
#data
<!DOCTYPE HTML><select><optgroup><option></optgroup><option><select><option>
#errors
Unexpected start tag <select> in <select>.
Unexpected start tag <option>.
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| <select>
| <optgroup>
| <option>
| <option>
#data
<!DOCTYPE HTML><select><optgroup><option><optgroup>
#errors
Unexpected end of file. Missing closing tags.
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| <select>
| <optgroup>
| <option>
| <optgroup>
#data
<!DOCTYPE HTML><font><input><input></font>
#errors
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| <font>
| <input>
| <input>
#data
<!DOCTYPE HTML><!-- XXX - XXX -->
#errors
#document
| <!DOCTYPE HTML>
| <!-- XXX - XXX -->
| <html>
| <head>
| <body>
#data
<!DOCTYPE HTML><!-- XXX - XXX
#errors
Unexpected EOF in comment.
#document
| <!DOCTYPE HTML>
| <!-- XXX - XXX -->
| <html>
| <head>
| <body>
#data
<!DOCTYPE HTML><!-- XXX - XXX - XXX -->
#errors
#document
| <!DOCTYPE HTML>
| <!-- XXX - XXX - XXX -->
| <html>
| <head>
| <body>
#data
<isindex test=x name=x>
#errors
No doctype
<isindex> is not ok!
#document
| <html>
| <head>
| <body>
| <form>
| <hr>
| <p>
| <label>
| "This is a searchable index. Insert your search keywords here:"
| <input>
| test="x"
| name="isindex"
| <hr>
#data
test
test
#errors
No doctype
#document
| <html>
| <head>
| <body>
| "test
test"
#data
<p><b><i><u></p>
<p>X
#errors
No doctype
Unexpected end tag p.
Unexpected EOF.
#document
| <html>
| <head>
| <body>
| <p>
| <b>
| <i>
| <u>
| "
"
| <p>
| <b>
| <i>
| <u>
| "X"
#data
<!DOCTYPE HTML><body><title>test</body></title>
#errors
Unexpected start tag that belongs in the head.
Expected closing tag after </.
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <title>
| "test</body>"
| <body>
#data
<!DOCTYPE HTML><body><title>X</title><meta name=z><link rel=foo><style>
x { content:"</style" } </style>
#errors
Unexpected start tag that belongs in head.
Unexpected start tag that belongs in head.
Unexpected start tag that belongs in head.
Expected closing tag after </.
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <title>
| "X"
| <body>
| <meta>
| name="z"
| <link>
| rel="foo"
| <style>
| "
x { content:"</style" } "
#data
<!DOCTYPE HTML><select><optgroup></optgroup></select>
#errors
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| <select>
| <optgroup>
#data
#errors
No doctype.
#document
| "
"
| <html>
| <head>
| <body>
#data
<!DOCTYPE HTML> <html>
#errors
#document
| <!DOCTYPE HTML>
| " "
| <html>
| <head>
| <body>
#data
<!DOCTYPE HTML><script>
</script> <title>x</title> </head>
#errors
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <script>
| "
"
| " "
| <title>
| "x"
| " "
| <body>
#data
<!DOCTYPE HTML><html><body><html id=x>
#errors
duplicate html start tag
#document
| <!DOCTYPE HTML>
| <html>
| id="x"
| <head>
| <body>
#data
<!DOCTYPE HTML>X</body><html id="x">
#errors
Unexpected html start tag in the after body phase.
html needs to be the first start tag.
#document
| <!DOCTYPE HTML>
| <html>
| id="x"
| <head>
| <body>
| "X"
#data
<!DOCTYPE HTML><head><html id=x>
#errors
html start tag too late
#document
| <!DOCTYPE HTML>
| <html>
| id="x"
| <head>
| <body>
#data
<!DOCTYPE HTML>X</html>X
#errors
Unexpected non-space characters. Expected end of file.
Unexpected non-space characters in after body phase. Expected end of file.
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| "XX"
#data
<!DOCTYPE HTML>X</html>
#errors
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| "X "
#data
<!DOCTYPE HTML>X</html><p>X
#errors
Unexpected start tag <p> in trailing end phase.
Unexpected start tag <p> in after body phase.
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| "X"
| <p>
| "X"
#data
<!DOCTYPE HTML>X<p/x/y/z>
#errors
Solidus (/) incorrectly placed.
Solidus (/) incorrectly placed.
Solidus (/) incorrectly placed.
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| "X"
| <p>
| y=""
| x=""
| z=""
#data
<!DOCTYPE HTML><!--x--
#errors
Unexpected end of file in comment.
#document
| <!DOCTYPE HTML>
| <!-- x -->
| <html>
| <head>
| <body>
#data
<!DOCTYPE HTML><table><tr><td></p></table>
#errors
Unexpected </p> end tag.
#document
| <!DOCTYPE HTML>
| <html>
| <head>
| <body>
| <table>
| <tbody>
| <tr>
| <td>

View file

@ -0,0 +1,236 @@
#data
<head></head><style></style>
#errors
No DOCTYPE
<style> in after-head mode
#document
| <html>
| <head>
| <style>
| <body>
#data
<head></head><script></script>
#errors
No DOCTYPE
<script> in after-head mode
#document
| <html>
| <head>
| <script>
| <body>
#data
<head></head><!-- --><style></style><!-- --><script></script>
#errors
No DOCTYPE
<style> in after-head mode
#document
| <html>
| <head>
| <style>
| <script>
| <!-- -->
| <!-- -->
| <body>
#data
<head></head><!-- -->x<style></style><!-- --><script></script>
#errors
No DOCTYPE
#document
| <html>
| <head>
| <!-- -->
| <body>
| "x"
| <style>
| <!-- -->
| <script>
#data
<!DOCTYPE htML><html><head></head><body><pre>
</pre></body></html>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <pre>
#data
<!DOCTYPE htML><html><head></head><body><pre>
foo</pre></body></html>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <pre>
| "foo"
#data
<!DOCTYPE htML><html><head></head><body><pre>
foo
</pre></body></html>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <pre>
| "foo
"
#data
<!DOCTYPE htML><html><head></head><body><pre>x</pre><span>
</span></body></html>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <pre>
| "x"
| <span>
| "
"
#data
<!DOCTYPE htML><html><head></head><body><pre>x
y</pre></body></html>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <pre>
| "x
y"
#data
<!DOCTYPE htML><html><head></head><body><pre>x<div>
y</pre></body></html>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <pre>
| "x"
| <div>
| "
| y"
#data
<!DOCTYPE htML><HTML><META><HEAD></HEAD></HTML>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <meta>
| <body>
#data
<!DOCTYPE htML><HTML><HEAD><head></HEAD></HTML>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
#data
<textarea>foo<span>bar</span><i>baz
#errors
#document
| <html>
| <head>
| <body>
| <textarea>
| "foo<span>bar</span><i>baz"
#data
<title>foo<span>bar</em><i>baz
#errors
#document
| <html>
| <head>
| <title>
| "foo<span>bar</em><i>baz"
| <body>
#data
<!DOCTYPE htML><textarea>
</textarea>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <textarea>
#data
<!DOCTYPE htML><textarea>
foo</textarea>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <textarea>
| "foo"
#data
<!DOCTYPE htML><html><head></head><body><ul><li><div><p><li></ul></body></html>
#errors
Missing end tag (div)
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <ul>
| <li>
| <div>
| <p>
| <li>
#data
<!doctype html><nobr><nobr><nobr>
#errors
Unexpected end of file.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <nobr>
| <nobr>
| <nobr>
#data
<!doctype html><nobr><nobr></nobr><nobr>
#errors
Unexpected end of file.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <nobr>
| <nobr>
| <nobr>

View file

@ -0,0 +1,44 @@
#data
direct div content
#errors
#document-fragment div
| "direct div content"
#data
direct textarea content
#errors
#document-fragment textarea
| "direct textarea content"
#data
textarea content with <em>pseudo</em> <foo>markup
#errors
#document-fragment textarea
| "textarea content with <em>pseudo</em> <foo>markup"
#data
this is &#x0043;DATA inside a <style> element
#errors
#document-fragment style
| "this is &#x0043;DATA inside a <style> element"
#data
</plaintext>
#errors
#document-fragment plaintext
| "</plaintext>"
#data
setting html's innerHTML
#errors
#document-fragment html
| <head>
| <body>
| "setting html's innerHTML"
#data
<title>setting head's innerHTML</title>
#errors
#document-fragment head
| <title>
| "setting head's innerHTML"

View file

@ -0,0 +1,120 @@
#data
<style> <!-- </style>x
#errors
No DOCTYPE
Unexpected end of file
#document
| <html>
| <head>
| <style>
| " <!-- </style>x"
| <body>
#data
<style> <!-- </style> --> </style>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <style>
| " <!-- </style> --> "
| <body>
| "x"
#data
<style> <!--> </style>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <style>
| " <!--> "
| <body>
| "x"
#data
<style> <!---> </style>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <style>
| " <!---> "
| <body>
| "x"
#data
<iframe> <!---> </iframe>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <body>
| <iframe>
| " <!---> "
| "x"
#data
<iframe> <!--- </iframe>->x</iframe> --> </iframe>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <body>
| <iframe>
| " <!--- </iframe>->x</iframe> --> "
| "x"
#data
<script> <!-- </script> --> </script>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <script>
| " <!-- </script> --> "
| <body>
| "x"
#data
<title> <!-- </title> --> </title>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <title>
| " <!-- </title> --> "
| <body>
| "x"
#data
<textarea> <!--- </textarea>->x</textarea> --> </textarea>x
#errors
No DOCTYPE
#document
| <html>
| <head>
| <body>
| <textarea>
| " <!--- </textarea>->x</textarea> --> "
| "x"
#data
<style> <!</-- </style>x
#errors
No DOCTYPE
Unexpected end of file
#document
| <html>
| <head>
| <style>
| " <!</-- "
| <body>
| "x"

View file

@ -0,0 +1,29 @@
#data
<!doctype html></head> <head>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| " "
| <body>
#data
<!doctype html></html> <head>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| " "
#data
<!doctype html></body><meta>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <meta>

View file

@ -1,23 +1,81 @@
require 'test/unit'
HTML5LIB_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))
HTML5LIB_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))
if File.exists?(File.join(HTML5LIB_BASE, 'testdata'))
TESTDATA_DIR = File.join(HTML5LIB_BASE, 'testdata')
else
TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata')
end
$:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')
$:.unshift File.dirname(__FILE__)
def html5lib_test_files(subdirectory)
Dir[File.join(HTML5LIB_BASE, 'tests', subdirectory, '*.*')]
Dir[File.join(TESTDATA_DIR, subdirectory, '*.*')]
end
begin
require 'jsonx'
require 'rubygems'
require 'json'
rescue LoadError
class JSON
def self.parse json
json.gsub! /"\s*:/, '"=>'
json.gsub!(/"\s*:/, '"=>')
json.gsub!(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
null = nil
eval json
end
end
end
module HTML5lib
module TestSupport
def self.startswith?(a, b)
b[0... a.length] == a
end
def self.parseTestcase(data)
innerHTML = nil
input = []
output = []
errors = []
currentList = input
data.split(/\n/).each do |line|
if !line.empty? and !startswith?("#errors", line) and
!startswith?("#document", line) and
!startswith?("#data", line) and
!startswith?("#document-fragment", line)
if currentList == output and startswith?("|", line)
currentList.push(line[2..-1])
else
currentList.push(line)
end
elsif line == "#errors"
currentList = errors
elsif line == "#document" or startswith?("#document-fragment", line)
if startswith?("#document-fragment", line)
innerHTML = line[19..-1]
raise AssertionError unless innerHTML
end
currentList = output
end
end
return innerHTML, input.join("\n"), output.join("\n"), errors
end
# convert the output of str(document) to the format used in the testcases
def convertTreeDump(treedump)
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
end
def sortattrs(output)
output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
match.split("\n").sort.join("\n")
end
end
end
end
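For reference, TestSupport.parseTestcase above splits one #data block (the chunk between #data markers) into the context element, the input markup, the expected tree dump, and the error list. A small illustrative example with a hypothetical inline fixture:

# One #data block, written inline purely for illustration
# (the test files are normally read via File.read(test_file).split("#data\n")).
data = <<TESTCASE
<p>One<p>Two
#errors
No doctype.
#document
| <html>
|   <head>
|   <body>
|     <p>
|       "One"
|     <p>
|       "Two"
TESTCASE

innerHTML, input, output, errors = HTML5lib::TestSupport.parseTestcase(data)
# innerHTML => nil (no #document-fragment line)
# input     => "<p>One<p>Two"
# errors    => ["No doctype."]
# output    => the tree dump above with the leading "| " stripped from each line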

View file

@ -4,33 +4,32 @@ require 'html5lib/inputstream'
class Html5EncodingTestCase < Test::Unit::TestCase
begin
begin
require 'rubygems'
require 'UniversalDetector'
def test_chardet
File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file|
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
assert_equal 'big5', stream.char_encoding.downcase
end
file = File.open(File.join(TESTDATA_DIR, 'encoding', 'chardet', 'test_big5.txt'), 'r')
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
assert_equal 'big5', stream.char_encoding.downcase
rescue LoadError
puts "chardet not found, skipping chardet tests"
end
rescue LoadError
puts "chardet not found, skipping chardet tests"
end
end
html5lib_test_files('encoding').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
html5lib_test_files('encoding').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
input, encoding = data.split(/\n#encoding\s+/, 2)
encoding = encoding.split[0]
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
input, encoding = data.split(/\n#encoding\s+/, 2)
encoding = encoding.split[0]
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
assert_equal encoding.downcase, stream.char_encoding.downcase, input
end
end
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
assert_equal encoding.downcase, stream.char_encoding.downcase, input
end
end
end
end

View file

@ -191,13 +191,13 @@ EOX
end
def test_br
assert_xhtml_equal <<EOX
assert_xhtml_equal <<EOX1
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>XLINK</title></head>
<body>
<br/>
</body></html>
EOX
EOX1
end
def xtest_strong

View file

@ -12,55 +12,14 @@ begin
rescue LoadError
end
$CHECK_PARSER_ERRORS = false
$CHECK_PARSER_ERRORS = ARGV.delete('-p') # TODO
puts 'Testing: ' + $tree_types_to_test * ', '
puts 'Testing tree builders: ' + $tree_types_to_test * ', '
class Html5ParserTestCase < Test::Unit::TestCase
def self.startswith?(a, b)
b[0... a.length] == a
end
def self.parseTestcase(data)
innerHTML = nil
input = []
output = []
errors = []
currentList = input
data.split(/\n/).each do |line|
if !line.empty? and !startswith?("#errors", line) and
!startswith?("#document", line) and
!startswith?("#data", line) and
!startswith?("#document-fragment", line)
if currentList == output and startswith?("|", line)
currentList.push(line[2..-1])
else
currentList.push(line)
end
elsif line == "#errors"
currentList = errors
elsif line == "#document" or startswith?("#document-fragment", line)
if startswith?("#document-fragment", line)
innerHTML = line[19..-1]
raise AssertionError unless innerHTML
end
currentList = output
end
end
return innerHTML, input.join("\n"), output.join("\n"), errors
end
# convert the output of str(document) to the format used in the testcases
def convertTreeDump(treedump)
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
end
def sortattrs(output)
output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") }
end
include HTML5lib
include TestSupport
html5lib_test_files('tree-construction').each do |test_file|
@ -69,12 +28,13 @@ class Html5ParserTestCase < Test::Unit::TestCase
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
innerHTML, input, expected_output, expected_errors = parseTestcase(data)
innerHTML, input, expected_output, expected_errors =
TestSupport.parseTestcase(data)
$tree_types_to_test.each do |tree_name|
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name))
parser = HTMLParser.new(:tree => TreeBuilders[tree_name])
if innerHTML
parser.parseFragment(input, innerHTML)
@ -85,16 +45,17 @@ class Html5ParserTestCase < Test::Unit::TestCase
actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
assert_equal sortattrs(expected_output), sortattrs(actual_output), [
'Input:', input,
'Expected:', expected_output,
'Recieved:', actual_output
'', 'Input:', input,
'', 'Expected:', expected_output,
'', 'Received:', actual_output
].join("\n")
if $CHECK_PARSER_ERRORS
actual_errors = parser.errors.map do |(line, col), message|
'Line: %i Col: %i %s' % [line, col, message]
end
assert_equal parser.errors.length, expected_errors.length, [
assert_equal expected_errors.length, parser.errors.length, [
'Input', input + "\n",
'Expected errors:', expected_errors.join("\n"),
'Actual errors:', actual_errors.join("\n")
].join("\n")

View file

@ -2,209 +2,145 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/sanitizer'
require 'html5lib/html5parser'
require 'html5lib/liberalxmlparser'
require 'html5lib/treewalkers'
require 'html5lib/serializer'
require 'html5lib/sanitizer'
class SanitizeTest < Test::Unit::TestCase
include HTML5lib
def sanitize_xhtml stream
XHTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
XHTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).to_s
end
def sanitize_html stream
HTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
HTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).to_s
end
def sanitize_rexml stream
require 'rexml/document'
doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{stream}</div>")
tokens = TreeWalkers.getTreeWalker('rexml').new(doc)
HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
:quote_attr_values => true,
:quote_char => "'",
:minimize_boolean_attributes => false,
:use_trailing_solidus => true,
:omit_optional_tags => false,
:inject_meta_charset => false,
:sanitize => true}).gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
rescue REXML::ParseException
return "Ill-formed XHTML!"
end
def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
assert_equal htmloutput, sanitize_html(input)
assert_equal xhtmloutput, sanitize_xhtml(input)
assert_equal rexmloutput, sanitize_rexml(input)
end
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
next if %w[caption col colgroup optgroup option table tbody td tfoot th thead tr].include?(tag_name) ### TODO
define_method "test_should_allow_#{tag_name}_tag" do
if tag_name == 'image'
assert_equal "<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
htmloutput = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
rexmloutput = xhtmloutput
if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
xhtmloutput = htmloutput
elsif tag_name == 'col'
htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
xhtmloutput = htmloutput
rexmloutput = "<col title='1' />"
elsif tag_name == 'table'
htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
xhtmloutput = htmloutput
elsif tag_name == 'image'
htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
xhtmloutput = htmloutput
rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
elsif VOID_ELEMENTS.include?(tag_name)
assert_equal "<#{tag_name} title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
else
assert_equal "<#{tag_name.downcase} title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>",
sanitize_html("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
assert_equal "<#{tag_name} title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>",
sanitize_xhtml("<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>")
htmloutput = "<#{tag_name} title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
xhtmloutput = htmloutput
htmloutput += '<br/>' if tag_name == 'br'
rexmloutput = "<#{tag_name} title='1' />"
end
check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
end
end
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
define_method "test_should_forbid_#{tag_name.upcase}_tag" do
assert_equal "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;",
sanitize_html("<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>")
input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
check_sanitization(input, output, output, output)
end
end
HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
next if attribute_name == 'style'
define_method "test_should_allow_#{attribute_name}_attribute" do
assert_equal "<p #{attribute_name.downcase}=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
sanitize_html("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
assert_equal "<p #{attribute_name}=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
sanitize_xhtml("<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>")
input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
check_sanitization(input, htmloutput, output, output)
end
end
HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
assert_equal "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
sanitize_html("<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>")
input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
output = "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
check_sanitization(input, output, output, output)
end
end
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
define_method "test_should_allow_#{protocol}_uris" do
assert_equal "<a href=\"#{protocol}\">foo</a>",
sanitize_html(%(<a href="#{protocol}">foo</a>))
input = %(<a href="#{protocol}">foo</a>)
output = "<a href='#{protocol}'>foo</a>"
check_sanitization(input, output, output, output)
end
end
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
define_method "test_should_allow_uppercase_#{protocol}_uris" do
assert_equal "<a href=\"#{protocol.upcase}\">foo</a>",
sanitize_html(%(<a href="#{protocol.upcase}">foo</a>))
input = %(<a href="#{protocol.upcase}">foo</a>)
output = "<a href='#{protocol.upcase}'>foo</a>"
check_sanitization(input, output, output, output)
end
end
def test_should_allow_anchors
assert_equal "<a href=\"foo\">&lt;script&gt;baz&lt;/script&gt;</a>",
sanitize_html("<a href='foo' onclick='bar'><script>baz</script></a>")
end
# RFC 3986, sec 4.2
def test_allow_colons_in_path_component
assert_equal "<a href=\"./this:that\">foo</a>",
sanitize_html("<a href=\"./this:that\">foo</a>")
end
%w(src width height alt).each do |img_attr|
define_method "test_should_allow_image_#{img_attr}_attribute" do
assert_equal "<img #{img_attr}=\"foo\"/>",
sanitize_html("<img #{img_attr}='foo' onclick='bar' />")
end
end
def test_should_handle_non_html
assert_equal 'abc', sanitize_html("abc")
end
def test_should_handle_blank_text
assert_equal '', sanitize_html('')
end
[%w(img src), %w(a href)].each do |(tag, attr)|
close = VOID_ELEMENTS.include?(tag) ? "/>boo" : ">boo</#{tag}>"
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols" do
assert_equal %(<#{tag} title="1"#{close}), sanitize_html(%(<#{tag} #{attr}="javascript:XSS" title="1">boo</#{tag}>))
end
define_method "test_should_strip_#{attr}_attribute_in_#{tag}_with_bad_protocols_and_whitespace" do
assert_equal %(<#{tag} title="1"#{close}), sanitize_html(%(<#{tag} #{attr}=" javascript:XSS" title="1">boo</#{tag}>))
end
end
[%(<img src="javascript:alert('XSS');" />),
%(<img src=javascript:alert('XSS') />),
%(<img src="JaVaScRiPt:alert('XSS')" />),
%(<img src='javascript:alert(&quot;XSS&quot;)' />),
%(<img src='javascript:alert(String.fromCharCode(88,83,83))' />),
%(<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />),
%(<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041' />),
%(<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29' />),
%(<img src="jav\tascript:alert('XSS');" />),
%(<img src="jav&#x09;ascript:alert('XSS');" />),
%(<img src="jav&#x0A;ascript:alert('XSS');" />),
%(<img src="jav&#x0D;ascript:alert('XSS');" />),
%(<img src=" &#14; javascript:alert('XSS');" />),
%(<img src="&#x20;javascript:alert('XSS');" />),
%(<img src="&#xA0;javascript:alert('XSS');" />)].each_with_index do |img_hack, i|
define_method "test_should_not_fall_for_xss_image_hack_#{i}" do
assert_equal "<img/>", sanitize_html(img_hack)
end
end
def test_should_sanitize_tag_broken_up_by_null
assert_equal "&lt;scr\357\277\275ipt&gt;alert(\"XSS\")&lt;/scr\357\277\275ipt&gt;", sanitize_html(%(<scr\0ipt>alert(\"XSS\")</scr\0ipt>))
end
def test_should_sanitize_invalid_script_tag
assert_equal "&lt;script XSS=\"\" SRC=\"http://ha.ckers.org/xss.js\"&gt;&lt;/script&gt;", sanitize_html(%(<script/XSS SRC="http://ha.ckers.org/xss.js"></script>))
end
def test_should_sanitize_script_tag_with_multiple_open_brackets
assert_equal "&lt;&lt;script&gt;alert(\"XSS\");//&lt;&lt;/script&gt;", sanitize_html(%(<<script>alert("XSS");//<</script>))
assert_equal %(&lt;iframe src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;), sanitize_html(%(<iframe src=http://ha.ckers.org/scriptlet.html\n<))
end
def test_should_sanitize_unclosed_script
assert_equal "&lt;script src=\"http://ha.ckers.org/xss.js?\"&gt;<b/>", sanitize_html(%(<script src=http://ha.ckers.org/xss.js?<b>))
end
def test_should_sanitize_half_open_scripts
assert_equal "<img/>", sanitize_html(%(<img src="javascript:alert('XSS')"))
end
def test_should_not_fall_for_ridiculous_hack
img_hack = %(<img\nsrc\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n />)
assert_equal "<img/>", sanitize_html(img_hack)
end
def test_platypus
assert_equal %(<a href=\"http://www.ragingplatypus.com/\" style=\"display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;\">never trust your upstream platypus</a>),
sanitize_html(%(<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>))
end
def test_xul
assert_equal %(<p style="">fubar</p>),
sanitize_html(%(<p style="-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')">fubar</p>))
end
def test_input_image
assert_equal %(<input type="image"/>),
sanitize_html(%(<input type="image" src="javascript:alert('XSS');" />))
end
def test_non_alpha_non_digit
assert_equal "&lt;script XSS=\"\" src=\"http://ha.ckers.org/xss.js\"&gt;&lt;/script&gt;",
sanitize_html(%(<script/XSS src="http://ha.ckers.org/xss.js"></script>))
assert_equal "<a>foo</a>",
sanitize_html('<a onclick!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>foo</a>')
assert_equal "<img src=\"http://ha.ckers.org/xss.js\"/>",
sanitize_html('<img/src="http://ha.ckers.org/xss.js"/>')
end
def test_img_dynsrc_lowsrc
assert_equal "<img/>",
sanitize_html(%(<img dynsrc="javascript:alert('XSS')" />))
assert_equal "<img/>",
sanitize_html(%(<img lowsrc="javascript:alert('XSS')" />))
end
def test_div_background_image_unicode_encoded
assert_equal '<div style="">foo</div>',
sanitize_html(%(<div style="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">foo</div>))
end
def test_div_expression
assert_equal '<div style="">foo</div>',
sanitize_html(%(<div style="width: expression(alert('XSS'));">foo</div>))
end
def test_img_vbscript
assert_equal '<img/>',
sanitize_html(%(<img src='vbscript:msgbox("XSS")' />))
end
def test_should_handle_astral_plane_characters
assert_equal "<p>\360\235\222\265 \360\235\224\270</p>",
sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
input = "<p>&#x1d4b5; &#x1d538;</p>"
output = "<p>\360\235\222\265 \360\235\224\270</p>"
check_sanitization(input, output, output, output)
input = "<p><tspan>\360\235\224\270</tspan> a</p>"
output = "<p><tspan>\360\235\224\270</tspan> a</p>"
check_sanitization(input, output, output, output)
end
# This affects only NS4. Is it worth fixing?
# def test_javascript_includes
# input = %(<div size="&{alert('XSS')}">foo</div>)
# output = "<div>foo</div>"
# check_sanitization(input, output, output, output)
# end
html5lib_test_files('sanitizer').each do |filename|
JSON::parse(open(filename).read).each do |test|
define_method "test_#{test['name']}" do
check_sanitization(
test['input'],
test['output'],
test['xhtml'] || test['output'],
test['rexml'] || test['output']
)
end
end
end
end

View file

@ -0,0 +1,68 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser'
require 'html5lib/serializer'
require 'html5lib/treewalkers'
#Run the serialize error checks
checkSerializeErrors = false
class JsonWalker < HTML5lib::TreeWalkers::Base
def each
@tree.each do |token|
case token[0]
when 'StartTag'
yield startTag(token[1], token[2])
when 'EndTag'
yield endTag(token[1])
when 'EmptyTag'
yield emptyTag(token[1], token[2])
when 'Comment'
yield comment(token[1])
when 'Characters', 'SpaceCharacters'
text(token[1]) {|textToken| yield textToken}
when 'Doctype'
yield doctype(token[1])
else
raise "Unknown token type: " + token[0]
end
end
end
end
class Html5SerializeTestcase < Test::Unit::TestCase
html5lib_test_files('serializer').each do |filename|
test_name = File.basename(filename).sub('.test', '')
tests = JSON::parse(open(filename).read)
tests['tests'].each_with_index do |test, index|
define_method "test_#{test_name}_#{index+1}" do
if test["options"] and test["options"]["encoding"]
test["options"][:encoding] = test["options"]["encoding"]
end
result = HTML5lib::HTMLSerializer.
serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
expected = test["expected"]
if expected.length == 1
assert_equal(expected[0], result, test["description"])
elsif !expected.include?(result)
flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
end
return if test_name == 'optionaltags'
result = HTML5lib::XHTMLSerializer.
serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
expected = test["xhtml"] || test["expected"]
if expected.length == 1
assert_equal(expected[0], result, test["description"])
elsif !expected.include?(result)
flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
end
end
end
end
end

vendor/plugins/HTML5lib/tests/test_stream.rb vendored Executable file
View file

@ -0,0 +1,62 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/inputstream'
class HTMLInputStreamTest < Test::Unit::TestCase
include HTML5lib
def test_char_ascii
stream = HTMLInputStream.new("'", :encoding=>'ascii')
assert_equal('ascii', stream.char_encoding)
assert_equal("'", stream.char)
end
def test_char_null
stream = HTMLInputStream.new("\x00")
assert_equal("\xef\xbf\xbd", stream.char)
end
def test_char_utf8
stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
assert_equal('utf-8', stream.char_encoding)
assert_equal("\xe2\x80\x98", stream.char)
end
def test_char_win1252
stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
assert_equal('windows-1252', stream.char_encoding)
assert_equal("\xc2\xa2", stream.char)
assert_equal("\xc3\x85", stream.char)
assert_equal("\xc3\xb1", stream.char)
assert_equal("\xe2\x80\x99", stream.char)
assert_equal("\xe2\x80\xa0", stream.char)
end
def test_bom
stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
assert_equal('utf-8', stream.char_encoding)
assert_equal("'", stream.char)
end
begin
require 'iconv'
def test_utf_16
stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
assert(stream.char_encoding, 'utf-16-le')
assert_equal(1025, stream.chars_until(' ',true).length)
end
rescue LoadError
puts "iconv not found, skipping iconv tests"
end
def test_newlines
stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
assert_equal([1,0], stream.position)
assert_equal("a\nbb\n", stream.chars_until('c'))
assert_equal([3,0], stream.position)
assert_equal("ccc\ndddd", stream.chars_until('x'))
assert_equal([4,4], stream.position)
assert_equal([1,2,3], stream.instance_eval {@line_lengths})
end
end

View file

@ -30,9 +30,10 @@ class Html5TokenizerTestCase < Test::Unit::TestCase
def tokenizer_test(data)
(data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
message = [
'Description:', data['description'],
'Input:', data['input'],
'Content Model Flag:', content_model_flag ] * "\n"
'', 'Description:', data['description'],
'', 'Input:', data['input'],
'', 'Content Model Flag:', content_model_flag,
'' ] * "\n"
assert_nothing_raised message do
tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])

View file

@ -0,0 +1,113 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser'
require 'html5lib/treewalkers'
require 'html5lib/treebuilders'
$tree_types_to_test = {
'simpletree' =>
{:builder => HTML5lib::TreeBuilders['simpletree'],
:walker => HTML5lib::TreeWalkers['simpletree']},
'rexml' =>
{:builder => HTML5lib::TreeBuilders['rexml'],
:walker => HTML5lib::TreeWalkers['rexml']},
'hpricot' =>
{:builder => HTML5lib::TreeBuilders['hpricot'],
:walker => HTML5lib::TreeWalkers['hpricot']},
}
puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', '
class TestTreeWalkers < Test::Unit::TestCase
include HTML5lib::TestSupport
def concatenateCharacterTokens(tokens)
charactersToken = nil
for token in tokens
type = token[:type]
if [:Characters, :SpaceCharacters].include?(type)
if charactersToken == nil
charactersToken = {:type => :Characters, :data => token[:data]}
else
charactersToken[:data] += token[:data]
end
else
if charactersToken != nil
yield charactersToken
charactersToken = nil
end
yield token
end
end
yield charactersToken if charactersToken != nil
end
def convertTokens(tokens)
output = []
indent = 0
concatenateCharacterTokens(tokens) do |token|
case token[:type]
when :StartTag, :EmptyTag
output << "#{' '*indent}<#{token[:name]}>"
indent += 2
for name, value in token[:data].to_a.sort
next if name=='xmlns'
output << "#{' '*indent}#{name}=\"#{value}\""
end
indent -= 2 if token[:type] == :EmptyTag
when :EndTag
indent -= 2
when :Comment
output << "#{' '*indent}<!-- #{token[:data]} -->"
when :Doctype
output << "#{' '*indent}<!DOCTYPE #{token[:name]}>"
when :Characters, :SpaceCharacters
output << "#{' '*indent}\"#{token[:data]}\""
else
# TODO: what to do with errors?
end
end
return output.join("\n")
end
html5lib_test_files('tree-construction').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '')
next if test_name == 'tests5' # TODO
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
innerHTML, input, expected_output, expected_errors =
HTML5lib::TestSupport::parseTestcase(data)
$tree_types_to_test.each do |tree_name, tree_class|
define_method "test_#{test_name}_#{index}_#{tree_name}" do
parser = HTML5lib::HTMLParser.new(:tree => tree_class[:builder])
if innerHTML
parser.parseFragment(input, innerHTML)
else
parser.parse(input)
end
document = parser.tree.getDocument
begin
output = sortattrs(convertTokens(tree_class[:walker].new(document)))
expected = sortattrs(expected_output)
assert_equal expected, output, [
'', 'Input:', input,
'', 'Expected:', expected,
'', 'Received:', output
].join("\n")
rescue NotImplementedError
# Amnesty for those that confess...
end
end
end
end
end
end

View file

@ -11,14 +11,15 @@ class TokenizerTestParser
debug = nil
for token in @tokenizer
debug = token.inspect if token[:type] == :ParseError
send ('process' + token[:type].to_s), token
send(('process' + token[:type].to_s), token)
end
return @outputTokens
end
def processDoctype(token)
@outputTokens.push(["DOCTYPE", token[:name], token[:data]])
@outputTokens.push(["DOCTYPE", token[:name], token[:publicId],
token[:systemId], token[:correct]])
end
def processStartTag(token)

View file

@ -7,6 +7,7 @@ module ActionController
end
module Caching
module Actions
# All documentation is kept DRY in the plugin's README
@ -17,7 +18,7 @@ module ActionController
end
def expire_one_action(options)
expire_fragment(Regexp.new(".*/" + ActionCachePath.path_for(self, options) + ".*"))
expire_fragment(Regexp.new(".*/" + Regexp.escape(ActionCachePath.path_for(self, options)) + ".*"))
end
def expire_action(options = {})
@ -134,7 +135,7 @@ module ActionController
controller.response.headers['Cache-Control'] == 'no-cache'
controller.response.headers['Cache-Control'] = "max-age=#{controller.response.time_to_live}"
end
controller.response.headers['Etag'] = "\"#{MD5.new(controller.response.body).to_s}\""
controller.response.headers['Etag'] = %{"#{MD5.new(controller.response.body).to_s}"}
controller.response.headers['Last-Modified'] ||= Time.now.httpdate
end
@ -147,7 +148,7 @@ module ActionController
def send_not_modified(controller)
controller.logger.info "Send Not Modified"
controller.response.headers['Etag'] = "\"#{MD5.new(fragment_body(controller)).to_s}\""
controller.response.headers['Etag'] = %{"#{MD5.new(fragment_body(controller)).to_s}"}
controller.render(:text => "", :status => 304)
end
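The Regexp.escape change in the expire_one_action hunk above matters because action-cache paths can contain regexp metacharacters; a small illustration of the difference (the path string is hypothetical):

path = 'wiki/show/HomePage+2.0'
Regexp.new('.*/' + path + '.*')                  # '+' and '.' act as regexp operators, so this over-matches
Regexp.new('.*/' + Regexp.escape(path) + '.*')   # matches only fragments containing the literal path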

View file

@ -154,6 +154,22 @@ Example:
CSS: style.css math.css
=end
# Render to an HTML fragment (returns a REXML document tree)
def to_html_tree
div = Element.new 'div'
div.attributes['class'] = 'maruku_wrapper_div'
children_to_html.each do |e|
div << e
end
# render footnotes
if @doc.footnotes_order.size > 0
div << render_footnotes
end
doc = Document.new(nil,{:respect_whitespace =>:all})
doc << div
end
# Render to a complete HTML document (returns a REXML document tree)
def to_html_document_tree

View file

@ -365,7 +365,7 @@ Otherwise, a standard `verbatim` environment is used.
color = get_setting(:code_background_color)
colorspec = latex_color(color, 'colorbox')
"#{colorspec}{\\tt #{s}}"
"{#{colorspec}{\\tt #{s}}}"
end
def to_latex_immediate_link