Merged with Jacques' latest changes.

This commit is contained in:
Jason Blevins 2007-09-03 09:14:51 -04:00
commit b96ff30026
111 changed files with 12210 additions and 3632 deletions

View file

@ -152,8 +152,7 @@ class ApplicationController < ActionController::Base
elsif %w(tex).include?(action_name) elsif %w(tex).include?(action_name)
response.headers['Content-Type'] = 'text/plain; charset=UTF-8' response.headers['Content-Type'] = 'text/plain; charset=UTF-8'
elsif request.env['HTTP_USER_AGENT'] =~ /Validator/ or request.env.include?('HTTP_ACCEPT') && elsif request.env['HTTP_USER_AGENT'] =~ /Validator/ or request.env.include?('HTTP_ACCEPT') &&
Mime::Type.parse(request.env["HTTP_ACCEPT"]).include?(Mime::XHTML) && Mime::Type.parse(request.env["HTTP_ACCEPT"]).include?(Mime::XHTML)
!(request.env['HTTP_USER_AGENT'] =~ /Safari/ and %w(s5).include?(action_name))
response.headers['Content-Type'] = 'application/xhtml+xml; charset=UTF-8' response.headers['Content-Type'] = 'application/xhtml+xml; charset=UTF-8'
elsif request.env['HTTP_USER_AGENT'] =~ /MathPlayer/ elsif request.env['HTTP_USER_AGENT'] =~ /MathPlayer/
response.headers['Content-Type'] = 'application/xhtml+xml' response.headers['Content-Type'] = 'application/xhtml+xml'

View file

@ -18,7 +18,7 @@ xml.feed('xmlns' => "http://www.w3.org/2005/Atom", "xml:lang" => 'en') do
xml.name(page.author) xml.name(page.author)
end end
if @hide_description if @hide_description
xml.summary('Content suppressed.', 'type' => 'text') xml.summary("Updated by #{page.author} on #{page.updated_at.getgm.strftime("%Y-%m-%d")} at #{page.updated_at.getgm.strftime("%H:%M:%SZ")}.", 'type' => 'text')
else else
xml.content('type' => 'xhtml', 'xml:base' => url_for(:only_path => false, :web => @web_name, :action => @link_action, :id => page.name) ) do xml.content('type' => 'xhtml', 'xml:base' => url_for(:only_path => false, :web => @web_name, :action => @link_action, :id => page.name) ) do
xml.div('xmlns' => 'http://www.w3.org/1999/xhtml' ) do xml.div('xmlns' => 'http://www.w3.org/1999/xhtml' ) do

View file

@ -11,6 +11,16 @@
%----Macros---------- %----Macros----------
\newcommand{\gt}{>} \newcommand{\gt}{>}
\newcommand{\lt}{<} \newcommand{\lt}{<}
\newcommand{\darr}{\downarrow}
\newcommand{\nearr}{\nearrow}
\newcommand{\nwarr}{\nwarrow}
\newcommand{\searr}{\searrow}
\newcommand{\swarr}{\swarrow}
% \iff is already defined by the LaTeX kernel, so \newcommand would stop with
% "Command \iff already defined"; \renewcommand overrides the existing macro.
\renewcommand{\iff}{\Longleftrightarrow}
\newcommand{\impliedby}{\Leftarrow}
\newcommand{\map}{\mapsto}
\newcommand{\embedsin}{\hookrightarrow}
\newcommand{\implies}{\Rightarrow}
\newcommand{\qed}{\blacksquare} \newcommand{\qed}{\blacksquare}
%------------------------------------------------------------------- %-------------------------------------------------------------------

View file

@ -16,7 +16,7 @@ class Category < Chunk::Abstract
def initialize(match_data, content) def initialize(match_data, content)
super(match_data, content) super(match_data, content)
@hidden = match_data[1] @hidden = match_data[1]
@list = match_data[2].split(',').map { |c| c.strip } @list = match_data[2].split(',').map { |c| html_escape(c.strip) }
@unmask_text = '' @unmask_text = ''
if @hidden if @hidden
@unmask_text = '' @unmask_text = ''

View file

@ -74,6 +74,13 @@ module Chunk
@content.delete_chunk(self) @content.delete_chunk(self)
end end
# Return a copy of +string+ with the characters &, <, > and " replaced by
# their HTML entities. The ampersand is handled first so that the entities
# introduced by the later substitutions are not escaped a second time.
# Single quotes are left untouched.
def html_escape(string)
  escaped = string.gsub('&', '&amp;')
  escaped = escaped.gsub('<', '&lt;')
  escaped = escaped.gsub('>', '&gt;')
  escaped.gsub('"', '&quot;')
end
end end
end end

View file

@ -25,14 +25,14 @@
module Sanitize module Sanitize
require 'html5lib/html5parser' require 'html5/html5parser'
require 'html5lib/liberalxmlparser' require 'html5/liberalxmlparser'
require 'html5lib/treewalkers' require 'html5/treewalkers'
require 'html5lib/treebuilders' require 'html5/treebuilders'
require 'html5lib/serializer' require 'html5/serializer'
require 'html5lib/sanitizer' require 'html5/sanitizer'
include HTML5lib include HTML5
# Sanitize a string, parsed using XHTML parsing rules. # Sanitize a string, parsed using XHTML parsing rules.
# #
@ -52,12 +52,12 @@ module Sanitize
options.each do |name, value| options.each do |name, value|
next unless %w(encoding treebuilder to_tree).include? name.to_s next unless %w(encoding treebuilder to_tree).include? name.to_s
if name.to_s == 'treebuilder' if name.to_s == 'treebuilder'
@treebuilder = HTML5lib::TreeBuilders.getTreeBuilder(value) @treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value)
else else
instance_variable_set("@#{name}", value) instance_variable_set("@#{name}", value)
end end
end end
parsed = XHTMLParser.parseFragment(html.to_ncr, {:tokenizer => HTMLSanitizer, parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
:encoding => @encoding, :tree => @treebuilder }) :encoding => @encoding, :tree => @treebuilder })
return parsed if @to_tree return parsed if @to_tree
return parsed.to_s return parsed.to_s
@ -81,12 +81,12 @@ module Sanitize
options.each do |name, value| options.each do |name, value|
next unless %w(encoding treebuilder to_tree).include? name.to_s next unless %w(encoding treebuilder to_tree).include? name.to_s
if name.to_s == 'treebuilder' if name.to_s == 'treebuilder'
@treebuilder = HTML5lib::TreeBuilders.getTreeBuilder(value) @treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value)
else else
instance_variable_set("@#{name}", value) instance_variable_set("@#{name}", value)
end end
end end
parsed = HTMLParser.parseFragment(html.to_ncr, {:tokenizer => HTMLSanitizer, parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
:encoding => @encoding, :tree => @treebuilder }) :encoding => @encoding, :tree => @treebuilder })
return parsed if @to_tree return parsed if @to_tree
return parsed.to_s return parsed.to_s
@ -98,13 +98,9 @@ module Sanitize
# sanitize_rexml(tree) -> string # sanitize_rexml(tree) -> string
# #
def sanitize_rexml(tree) def sanitize_rexml(tree)
tokens = TreeWalkers.getTreeWalker('rexml').new(tree.to_ncr) tokens = TreeWalkers.get_tree_walker('rexml').new(tree.to_ncr)
HTMLSerializer.serialize(tokens, {:encoding=>'utf-8', XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
:quote_attr_values => true,
:minimize_boolean_attributes => false,
:use_trailing_solidus => true,
:space_before_trailing_solidus => true, :space_before_trailing_solidus => true,
:omit_optional_tags => false,
:inject_meta_charset => false, :inject_meta_charset => false,
:sanitize => true}) :sanitize => true})
end end

View file

@ -16,4 +16,4 @@ table.plaintable {
text-align:center; text-align:center;
margin-left:30px; margin-left:30px;
} }
.noborder td, .noborder th {border:0}

View file

@ -1,6 +1,6 @@
/* Following are the presentation styles -- edit away! */ /* Following are the presentation styles -- edit away! */
body {background: #FFF; color: #000; font-size: 2em;} body {background: #FFF; color: #000; font-size: 1.6em;}
:link, :visited {text-decoration: none; color: #00C;} :link, :visited {text-decoration: none; color: #00C;}
#controls :active {color: #8A8 !important;} #controls :active {color: #8A8 !important;}
#controls :focus {outline: 1px dotted #272;} #controls :focus {outline: 1px dotted #272;}

View file

@ -1,4 +1,5 @@
// S5 v1.2a1 slides.js -- released into the Public Domain // S5 v1.2a2 slides.js -- released into the Public Domain
// Many modifications by Jacques Distler to allow operation as real XHTML.
// //
// Please see http://www.meyerweb.com/eric/tools/s5/credits.html for information // Please see http://www.meyerweb.com/eric/tools/s5/credits.html for information
// about all the wonderful and talented contributors to this code! // about all the wonderful and talented contributors to this code!
@ -30,6 +31,7 @@ var countdown = {
var isIE = navigator.appName == 'Microsoft Internet Explorer' && navigator.userAgent.indexOf('Opera') < 1 ? 1 : 0; var isIE = navigator.appName == 'Microsoft Internet Explorer' && navigator.userAgent.indexOf('Opera') < 1 ? 1 : 0;
var isOp = navigator.userAgent.indexOf('Opera') > -1 ? 1 : 0; var isOp = navigator.userAgent.indexOf('Opera') > -1 ? 1 : 0;
var isSa = navigator.userAgent.indexOf('Safari') > -1 ? 1 : 0;
var isGe = navigator.userAgent.indexOf('Gecko') > -1 && navigator.userAgent.indexOf('Safari') < 1 ? 1 : 0; var isGe = navigator.userAgent.indexOf('Gecko') > -1 && navigator.userAgent.indexOf('Safari') < 1 ? 1 : 0;
function hasClass(object, className) { function hasClass(object, className) {
@ -111,8 +113,15 @@ function slideLabel() {
for (var o = 0; o < menunodes.length; o++) { for (var o = 0; o < menunodes.length; o++) {
otext += nodeValue(menunodes[o]); otext += nodeValue(menunodes[o]);
} }
if (isSa) {
var option = createElement('option');
option.setAttribute('value', n);
option.appendChild(document.createTextNode(n + ' : ' + otext) );
list.appendChild(option);
} else {
list.options[list.length] = new Option(n + ' : ' + otext, n); list.options[list.length] = new Option(n + ' : ' + otext, n);
} }
}
} }
function currentSlide() { function currentSlide() {
@ -122,12 +131,12 @@ function currentSlide() {
} else { } else {
cs = document.currentSlide; cs = document.currentSlide;
} }
var plink = document.createElement('a'); var plink = createElement('a');
plink.id = 'plink'; plink.id = 'plink';
plink.setAttribute('href', ''); plink.setAttribute('href', '');
var csHere = document.createElement('span'); var csHere = createElement('span');
var csSep = document.createElement('span'); var csSep = createElement('span');
var csTotal = document.createElement('span'); var csTotal = createElement('span');
csHere.id = 'csHere'; csHere.id = 'csHere';
csSep.id = 'csSep'; csSep.id = 'csSep';
csTotal.id = 'csTotal'; csTotal.id = 'csTotal';
@ -376,7 +385,7 @@ function slideJump() {
function fixLinks() { function fixLinks() {
var thisUri = window.location.href; var thisUri = window.location.href;
thisUri = thisUri.slice(0, thisUri.length - window.location.hash.length); thisUri = thisUri.slice(0, thisUri.length - window.location.hash.length);
var aelements = document.getElementsByTagName('A'); var aelements = document.getElementsByTagName('a');
for (var i = 0; i < aelements.length; i++) { for (var i = 0; i < aelements.length; i++) {
var a = aelements[i].href; var a = aelements[i].href;
var slideID = a.match('\#slide[0-9]{1,2}'); var slideID = a.match('\#slide[0-9]{1,2}');
@ -418,43 +427,43 @@ function permaLink() {
function createControls() { function createControls() {
var controlsDiv = document.getElementById("controls"); var controlsDiv = document.getElementById("controls");
if (!controlsDiv) return; if (!controlsDiv) return;
var controlForm = document.createElement('form'); var controlForm = createElement('form');
controlForm.id = 'controlForm'; controlForm.id = 'controlForm';
controlForm.setAttribute('action', '#'); controlForm.setAttribute('action', '#');
if (controlVis == 'hidden') { if (controlVis == 'hidden') {
controlForm.setAttribute('onmouseover', 'showHide(\'s\');'); controlForm.setAttribute('onmouseover', 'showHide(\'s\');');
controlForm.setAttribute('onmouseout', 'showHide(\'h\');'); controlForm.setAttribute('onmouseout', 'showHide(\'h\');');
} }
var navLinks = document.createElement('div'); var navLinks = createElement('div');
navLinks.id = 'navLinks'; navLinks.id = 'navLinks';
var showNotes = document.createElement('a'); var showNotes = createElement('a');
showNotes.id = 'show-notes'; showNotes.id = 'show-notes';
showNotes.setAttribute('accesskey', 'n'); showNotes.setAttribute('accesskey', 'n');
showNotes.setAttribute('href', 'javascript:createNotesWindow();'); showNotes.setAttribute('href', 'javascript:createNotesWindow();');
showNotes.setAttribute('title', 'Show Notes'); showNotes.setAttribute('title', 'Show Notes');
showNotes.appendChild(document.createTextNode('\u2261')); showNotes.appendChild(document.createTextNode('\u2261'));
var toggle = document.createElement('a'); var toggle = createElement('a');
toggle.id = 'toggle'; toggle.id = 'toggle';
toggle.setAttribute('accesskey', 't'); toggle.setAttribute('accesskey', 't');
toggle.setAttribute('href', 'javascript:toggle();'); toggle.setAttribute('href', 'javascript:toggle();');
toggle.appendChild(document.createTextNode('\u00D8')); toggle.appendChild(document.createTextNode('\u00D8'));
var prev = document.createElement('a'); var prev = createElement('a');
prev.id = 'prev'; prev.id = 'prev';
prev.setAttribute('accesskey', 'z'); prev.setAttribute('accesskey', 'z');
prev.setAttribute('href', 'javascript:go(-1);'); prev.setAttribute('href', 'javascript:go(-1);');
prev.appendChild(document.createTextNode('\u00AB')); prev.appendChild(document.createTextNode('\u00AB'));
var next = document.createElement('a'); var next = createElement('a');
next.id = 'next'; next.id = 'next';
next.setAttribute('accesskey', 'x'); next.setAttribute('accesskey', 'x');
next.setAttribute('href', 'javascript:go(1);'); next.setAttribute('href', 'javascript:go(1);');
next.appendChild(document.createTextNode('\u00BB')); next.appendChild(document.createTextNode('\u00BB'));
var navList = document.createElement('div'); var navList = createElement('div');
navList.id = 'navList'; navList.id = 'navList';
if (controlVis != 'hidden') { if (controlVis != 'hidden') {
navList.setAttribute('onmouseover', 'showHide(\'s\');'); navList.setAttribute('onmouseover', 'showHide(\'s\');');
navList.setAttribute('onmouseout', 'showHide(\'h\');'); navList.setAttribute('onmouseout', 'showHide(\'h\');');
} }
var jumplist = document.createElement('select'); var jumplist = createElement('select');
jumplist.id = 'jumplist'; jumplist.id = 'jumplist';
jumplist.setAttribute('onchange', 'go(\'j\');'); jumplist.setAttribute('onchange', 'go(\'j\');');
navList.appendChild(jumplist); navList.appendChild(jumplist);
@ -503,7 +512,7 @@ function fontScale() { // causes layout problems in FireFox that get fixed if b
function fontSize(value) { function fontSize(value) {
if (!(s5ss = document.getElementById('s5ss'))) { if (!(s5ss = document.getElementById('s5ss'))) {
if (!document.createStyleSheet) { if (!document.createStyleSheet) {
document.getElementsByTagName('head')[0].appendChild(s5ss = document.createElement('style')); document.getElementsByTagName('head')[0].appendChild(s5ss = createElement('style'));
s5ss.setAttribute('media','screen, projection'); s5ss.setAttribute('media','screen, projection');
s5ss.setAttribute('id','s5ss'); s5ss.setAttribute('id','s5ss');
} else { } else {
@ -784,6 +793,14 @@ function readTime(val) {
} }
} }
// Create an element for the given tag name. When the DOM offers
// createElementNS the element is created in the XHTML namespace;
// otherwise plain document.createElement is used.
function createElement(element) {
  if (typeof document.createElementNS == 'undefined') {
    return document.createElement(element);
  }
  return document.createElementNS('http://www.w3.org/1999/xhtml', element);
}
function windowChange() { function windowChange() {
fontScale(); fontScale();
} }

View file

@ -0,0 +1,64 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN" "http://www.w3.org/Math/DTD/mathml2/xhtml-math11-f.dtd" >
<!-- Do not edit this document! The system will likely break if you do. -->
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Notes</title>
<link rel="stylesheet" href="default/notes.css" type="text/css" />
<script type="text/javascript">
// <![CDATA[
document.onkeyup = opener.keys;
document.onkeypress = opener.trap;
document.onclick = opener.clicker;
// ]]>
</script>
</head>
<body onload="opener.s5NotesWindowLoaded=true;" onunload="opener.s5NotesWindowLoaded=false;">
<div class="timers" id="elapsed">
<h1>
<a href="#" onclick="opener.minimizeTimer('elapsed'); return false;">Elapsed Time</a>
</h1>
<ul>
<li>
<h2>Presentation</h2>
<span class="clock" id="elapsed-presentation">00:00:00</span>
</li>
<li>
<h2>Current Slide</h2>
<span class="clock" id="elapsed-slide">00:00:00</span>
</li>
</ul>
<div class="controls">
<a href="#reset-elapsed" onclick="opener.resetElapsedTime(); return false;" title="Reset Elapsed Time">|&larr;</a>
</div>
</div>
<div class="timers" id="remaining">
<h1>
<a href="#" onclick="opener.minimizeTimer('remaining'); return false;">Remaining Time</a>
</h1>
<p>
<a href="#subtract-remaining" class="control" id="minus" onclick="opener.alterRemainingTime('-5'); return false;" title="Subtract 5 Minutes">-</a>
<span class="clock" id="timeLeft">00:00:00</span>
<a href="#add-remaining" class="control" id="plus" onclick="opener.alterRemainingTime('5'); return false;" title="Add 5 Minutes">+</a>
</p>
<div class="controls">
<form action="#" onsubmit="opener.resetRemainingTime(); return false;">
<input type="text" class="text" id="startFrom" value="0" size="4" maxlength="4" />
<a href="#toggle-remaining" onclick="opener.toggleRemainingTime(); return false;" title="Pause/Run Remaining Time">||</a>
<a href="#reset-remaining" onclick="opener.resetRemainingTime(); return false;" title="Reset Remaining Time">|&larr;</a>
</form>
</div>
</div>
<h2 id="slide">...</h2>
<div id="notes"></div>
<h2 id="next">...</h2>
<div id="nextnotes"></div>
</body>
</html>

5
vendor/plugins/HTML5lib/History.txt vendored Normal file
View file

@ -0,0 +1,5 @@
== 0.1.0 / 2007-08-07
* 1 major enhancement
* Birthday!

59
vendor/plugins/HTML5lib/Manifest.txt vendored Normal file
View file

@ -0,0 +1,59 @@
History.txt
Manifest.txt
README
Rakefile.rb
lib/html5.rb
lib/html5/constants.rb
lib/html5/filters/base.rb
lib/html5/filters/inject_meta_charset.rb
lib/html5/filters/optionaltags.rb
lib/html5/filters/sanitizer.rb
lib/html5/filters/whitespace.rb
lib/html5/html5parser.rb
lib/html5/html5parser/after_body_phase.rb
lib/html5/html5parser/after_frameset_phase.rb
lib/html5/html5parser/after_head_phase.rb
lib/html5/html5parser/before_head_phase.rb
lib/html5/html5parser/in_body_phase.rb
lib/html5/html5parser/in_caption_phase.rb
lib/html5/html5parser/in_cell_phase.rb
lib/html5/html5parser/in_column_group_phase.rb
lib/html5/html5parser/in_frameset_phase.rb
lib/html5/html5parser/in_head_phase.rb
lib/html5/html5parser/in_row_phase.rb
lib/html5/html5parser/in_select_phase.rb
lib/html5/html5parser/in_table_body_phase.rb
lib/html5/html5parser/in_table_phase.rb
lib/html5/html5parser/initial_phase.rb
lib/html5/html5parser/phase.rb
lib/html5/html5parser/root_element_phase.rb
lib/html5/html5parser/trailing_end_phase.rb
lib/html5/inputstream.rb
lib/html5/liberalxmlparser.rb
lib/html5/sanitizer.rb
lib/html5/serializer.rb
lib/html5/serializer/htmlserializer.rb
lib/html5/serializer/xhtmlserializer.rb
lib/html5/tokenizer.rb
lib/html5/treebuilders.rb
lib/html5/treebuilders/base.rb
lib/html5/treebuilders/hpricot.rb
lib/html5/treebuilders/rexml.rb
lib/html5/treebuilders/simpletree.rb
lib/html5/treewalkers.rb
lib/html5/treewalkers/base.rb
lib/html5/treewalkers/hpricot.rb
lib/html5/treewalkers/rexml.rb
lib/html5/treewalkers/simpletree.rb
lib/html5/version.rb
parse.rb
tests/preamble.rb
tests/test_encoding.rb
tests/test_lxp.rb
tests/test_parser.rb
tests/test_sanitizer.rb
tests/test_serializer.rb
tests/test_stream.rb
tests/test_tokenizer.rb
tests/test_treewalkers.rb
tests/tokenizer_test_parser.rb

View file

@ -1,9 +1,45 @@
= HTML5lib html5
by Ryan King, et al
http://code.google.com/p/html5lib
== Basic Usage == DESCRIPTION:
require 'html5lib' A ruby implementation of the parsing algorithm in HTML5.
doc = HTML5lib.parse('<html>...</html>')
doc.class # REXML::Document == FEATURES/PROBLEMS:
== SYNOPSIS:
TODO
== REQUIREMENTS:
* chardet, only tested with 0.9.0
== INSTALL:
* sudo gem install html5
== LICENSE:
Copyright (c) 2006-2007 The Authors
Contributors:
James Graham - jg307@cam.ac.uk
Anne van Kesteren - annevankesteren@gmail.com
Lachlan Hunt - lachlan.hunt@lachy.id.au
Matt McDonald - kanashii@kanashii.ca
Sam Ruby - rubys@intertwingly.net
Ian Hickson (Google) - ian@hixie.ch
Thomas Broyer - t.broyer@ltgt.net
Jacques Distler - distler@golem.ph.utexas.edu
Ryan King - ryan@theryanking.com
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View file

@ -1,7 +1,33 @@
require 'rake' require 'rake'
require 'rake/testtask' require 'hoe'
require 'lib/html5/version'
Rake::TestTask.new do |task| Hoe.new("html5", HTML5::VERSION) do |p|
task.pattern = 'tests/test_*.rb' p.name = "html5"
task.verbose = true p.description = p.paragraphs_of('README', 2..5).join("\n\n")
p.summary = "HTML5 parser/tokenizer."
p.author = ['Ryan King'] # TODO: add more names
p.email = 'ryan@theryanking.com'
p.url = 'http://code.google.com/p/html5lib'
p.need_zip = true
p.extra_deps << ['chardet', '>= 0.9.0']
p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
end
require 'rcov/rcovtask'
namespace :test do
namespace :coverage do
desc "Delete aggregate coverage data."
task(:clean) { rm_f "coverage.data" }
end
desc 'Aggregate code coverage for unit, functional and integration tests'
Rcov::RcovTask.new(:coverage => "test:coverage:clean") do |t|
t.libs << "tests"
t.test_files = FileList["tests/test_*.rb"]
t.output_dir = "tests/coverage/"
t.verbose = true
end
end end

215
vendor/plugins/HTML5lib/bin/html5 vendored Executable file
View file

@ -0,0 +1,215 @@
#!/usr/bin/env ruby
$:.unshift File.dirname(__FILE__), 'lib'
# Parse the input named by the last element of +args+ and print the result.
#
# opts - the options struct built at the bottom of this script; this method
#        reads treebuilder, output, parsemethod, profile and time from it.
# args - remaining command-line arguments; only the last one is used. It may
#        be an http:// URL, '-' for standard input, or a path to open.
#
# Exits with status 1 when no argument is given.
def parse(opts, args)
  source = args[-1]
  unless source
    $stderr.write("No filename provided. Use -h for help\n")
    exit(1)
  end

  encoding = nil
  begin
    if source[0..6] == 'http://'
      require 'open-uri'
      source = URI.parse(source).open
      encoding = source.charset
    elsif source == '-'
      source = $stdin
    else
      source = open(source)
    end
  rescue
    # Errors while opening are ignored on purpose: +source+ keeps whatever
    # value it held when the error occurred (the argument string itself if
    # nothing was opened) and is handed to the parser as-is.
  end

  require 'html5/treebuilders'
  builder = HTML5::TreeBuilders[opts.treebuilder]

  parser =
    if opts.output == :xml
      require 'html5/liberalxmlparser'
      HTML5::XMLParser.new(:tree=>builder)
    else
      require 'html5/html5parser'
      HTML5::HTMLParser.new(:tree=>builder)
    end

  # Any parse method other than :parse is given 'div' as an extra argument
  # between the input and the encoding.
  call_args = opts.parsemethod == :parse ? [source, encoding] : [source, 'div', encoding]

  if opts.profile
    # Profiling run: the parse result is discarded and nothing is printed
    # except the profile itself.
    require 'profiler'
    Profiler__::start_profile
    parser.send(opts.parsemethod, *call_args)
    Profiler__::stop_profile
    Profiler__::print_profile($stderr)
  elsif opts.time
    require 'time' # TODO: switch to benchmark
    started = Time.new
    document = parser.send(opts.parsemethod, *call_args)
    parsed = Time.new
    print_output(parser, document, opts)
    printed = Time.new
    puts "\n\nRun took: %fs (plus %fs to print the output)"%[parsed-started, printed-parsed]
  else
    print_output(parser, parser.send(opts.parsemethod, *call_args), opts)
  end
end
# Write +document+ to standard output in the format selected by opts.output
# (:xml, :html, :hilite or :tree; any other value prints nothing), preceded
# by the detected character encoding when opts.encoding is set and followed
# by the parser's error list when opts.error is set.
def print_output(parser, document, opts)
  if opts.encoding
    puts "Encoding: #{parser.tokenizer.stream.char_encoding}"
  end

  case opts.output
  when :xml
    print document
  when :html
    require 'html5/treewalkers'
    walker = HTML5::TreeWalkers[opts.treebuilder].new(document)
    require 'html5/serializer'
    puts HTML5::HTMLSerializer.serialize(walker, opts.serializer)
  when :hilite
    print document.hilite
  when :tree
    # A single node is wrapped so that documents and fragment lists are
    # printed through the same loop.
    fragments = document.respond_to?(:each) ? document : [document]
    fragments.each { |fragment| puts parser.tree.testSerializer(fragment) }
  end

  return unless opts.error

  # Each error is a (position, message) pair; the position supplies the
  # line and column for the format string.
  report = parser.errors.map { |pos, message| "Line %i Col %i"%pos + " " + message }
  $stdout.write("\nParse errors:\n" + report.join("\n")+"\n")
end
require 'ostruct'
# Default settings for a run; the command-line switches defined below
# overwrite individual entries.
options = OpenStruct.new(
  :profile     => false,
  :time        => false,
  :output      => :html,
  :treebuilder => 'simpletree',
  :error       => false,
  :encoding    => false,
  :parsemethod => :parse,
  # Options handed to the serializer when the output format is :html.
  :serializer  => {
    :encoding => 'utf-8',
    :omit_optional_tags => false,
    :inject_meta_charset => false
  }
)
require 'optparse'
# Command-line definition. Every handler records its value on the `options`
# struct created above (or in its :serializer hash); nothing is parsed until
# the opts.parse! call at the bottom.
# Note that the block parameter reuses the name `opts` of the variable that
# receives the finished OptionParser.
opts = OptionParser.new do |opts|
opts.separator ""
opts.separator "Parse Options:"
opts.on("-b", "--treebuilder NAME") do |treebuilder|
options.treebuilder = treebuilder
end
# Selects parse_fragment as the method that parse() sends to the parser.
opts.on("-f", "--fragment", "Parse as a fragment") do |parse|
options.parsemethod = :parse_fragment
end
opts.separator ""
opts.separator "Filter Options:"
opts.on("--[no-]inject-meta-charset", "inject <meta charset>") do |inject|
options.serializer[:inject_meta_charset] = inject
end
opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip|
options.serializer[:strip_whitespace] = strip
end
opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize|
options.serializer[:sanitize] = sanitize
end
opts.separator ""
opts.separator "Output Options:"
opts.on("--tree", "output as debug tree") do |tree|
options.output = :tree
end
# XML output also switches the treebuilder to rexml.
opts.on("-x", "--xml", "output as xml") do |xml|
options.output = :xml
options.treebuilder = "rexml"
end
# --no-html sets the output format to nil, which matches none of the
# formats handled in print_output, so no document is printed.
opts.on("--[no-]html", "Output as html") do |html|
options.output = (html ? :html : nil)
end
opts.on("--hilite", "Output as formatted highlighted code.") do |hilite|
options.output = :hilite
end
opts.on("-e", "--error", "Print a list of parse errors") do |error|
options.error = error
end
opts.separator ""
opts.separator "Serialization Options:"
# The switches in this section are stored in options.serializer, which
# print_output passes to the HTML serializer.
opts.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit|
options.serializer[:omit_optional_tags] = omit
end
opts.on("--[no-]quote-attr-values", "Quote attribute values") do |quote|
options.serializer[:quote_attr_values] = quote
end
opts.on("--[no-]use-best-quote-char", "Use best quote character") do |best|
options.serializer[:use_best_quote_char] = best
end
opts.on("--quote-char C", "Use specified quote character") do |c|
options.serializer[:quote_char] = c
end
opts.on("--[no-]minimize-boolean-attributes", "Minimize boolean attributes") do |min|
options.serializer[:minimize_boolean_attributes] = min
end
opts.on("--[no-]use-trailing-solidus", "Use trailing solidus") do |slash|
options.serializer[:use_trailing_solidus] = slash
end
opts.on("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values") do |lt|
options.serializer[:escape_lt_in_attrs] = lt
end
opts.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata|
options.serializer[:escape_rcdata] = rcdata
end
opts.separator ""
opts.separator "Other Options:"
opts.on("-p", "--[no-]profile", "Profile the run") do |profile|
options.profile = profile
end
opts.on("-t", "--[no-]time", "Time the run") do |time|
options.time = time
end
opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding|
options.encoding = encoding
end
opts.on_tail("-h", "--help", "Show this message") do
puts opts
exit
end
end
# parse! removes the recognised switches from ARGV, so what is left for
# parse() are the positional arguments; it reads the last one as the input.
opts.parse!(ARGV)
parse options, ARGV

13
vendor/plugins/HTML5lib/lib/html5.rb vendored Normal file
View file

@ -0,0 +1,13 @@
require 'html5/html5parser'
require 'html5/version'
# Module-level convenience wrappers around HTMLParser.
module HTML5
  # Parse +stream+ as a complete document.
  #
  # stream  - the input handed to HTMLParser.parse.
  # options - hash passed through unchanged to HTMLParser.parse.
  #
  # Returns whatever HTMLParser.parse returns.
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  # Parse +stream+ as a fragment.
  #
  # This previously delegated to HTMLParser.parse, so fragments were parsed
  # as whole documents; it now calls HTMLParser.parse_fragment.
  #
  # stream  - the input handed to HTMLParser.parse_fragment.
  # options - hash passed through unchanged to HTMLParser.parse_fragment.
  #
  # Returns whatever HTMLParser.parse_fragment returns.
  def self.parse_fragment(stream, options={})
    HTMLParser.parse_fragment(stream, options)
  end
end

818
vendor/plugins/HTML5lib/lib/html5/constants.rb vendored Executable file
View file

@ -0,0 +1,818 @@
module HTML5
# Derives directly from Exception rather than StandardError, so a bare
# `rescue` clause does not catch it; it has to be rescued by name.
# Presumably raised to signal the end of the input -- confirm against the
# tokenizer / input stream code.
class EOF < Exception; end
CONTENT_MODEL_FLAGS = [
:PCDATA,
:RCDATA,
:CDATA,
:PLAINTEXT
]
SCOPING_ELEMENTS = %w[
button
caption
html
marquee
object
table
td
th
]
FORMATTING_ELEMENTS = %w[
a
b
big
em
font
i
nobr
s
small
strike
strong
tt
u
]
SPECIAL_ELEMENTS = %w[
address
area
base
basefont
bgsound
blockquote
body
br
center
col
colgroup
dd
dir
div
dl
dt
embed
fieldset
form
frame
frameset
h1
h2
h3
h4
h5
h6
head
hr
iframe
image
img
input
isindex
li
link
listing
menu
meta
noembed
noframes
noscript
ol
optgroup
option
p
param
plaintext
pre
script
select
spacer
style
tbody
textarea
tfoot
thead
title
tr
ul
wbr
]
# Whitespace characters as one-character strings. The upper-case %W literal
# processes backslash escapes, so each entry is the character itself (tab,
# line feed, vertical tab, form feed, space, carriage return) and not the
# escape sequence as written.
SPACE_CHARACTERS = %W[
\t
\n
\x0B
\x0C
\x20
\r
]
TABLE_INSERT_MODE_ELEMENTS = %w[
table
tbody
tfoot
thead
tr
]
# Character classes. Note the mixed types: the ASCII_* constants are Strings
# containing every letter, DIGITS is a Range of one-character strings, and
# HEX_DIGITS is an Array of one-character strings (digits, then a-f, then A-F).
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
DIGITS = '0'..'9'
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
# Heading elements need to be ordered
HEADING_ELEMENTS = %w[
h1
h2
h3
h4
h5
h6
]
# XXX What about event-source and command?
VOID_ELEMENTS = %w[
base
link
meta
hr
br
img
embed
param
area
col
input
]
# Elements grouped by the content-model flag they select (see
# CONTENT_MODEL_FLAGS above).
# NOTE(review): the two names look swapped relative to the HTML5 draft's own
# terminology, where title/textarea are the RCDATA elements and style/script
# and friends are CDATA -- confirm how the parser and serializer use these
# lists before relying on the names.
CDATA_ELEMENTS = %w[title textarea]
RCDATA_ELEMENTS = %w[
style
script
xmp
iframe
noembed
noframes
noscript
]
BOOLEAN_ATTRIBUTES = {
:global => %w[irrelevant],
'style' => %w[scoped],
'img' => %w[ismap],
'audio' => %w[autoplay controls],
'video' => %w[autoplay controls],
'script' => %w[defer async],
'details' => %w[open],
'datagrid' => %w[multiple disabled],
'command' => %w[hidden disabled checked default],
'menu' => %w[autosubmit],
'fieldset' => %w[disabled readonly],
'option' => %w[disabled readonly selected],
'optgroup' => %w[disabled readonly],
'button' => %w[disabled autofocus],
'input' => %w[disabled readonly required autofocus checked ismap],
'select' => %w[disabled readonly autofocus multiple],
'output' => %w[disabled readonly]
}
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN
65533, # 0x81 UNDEFINED
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
8224, # 0x86 0x2020 DAGGER
8225, # 0x87 0x2021 DOUBLE DAGGER
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
8240, # 0x89 0x2030 PER MILLE SIGN
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
65533, # 0x8D UNDEFINED
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
65533, # 0x8F UNDEFINED
65533, # 0x90 UNDEFINED
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
8226, # 0x95 0x2022 BULLET
8211, # 0x96 0x2013 EN DASH
8212, # 0x97 0x2014 EM DASH
732, # 0x98 0x02DC SMALL TILDE
8482, # 0x99 0x2122 TRADE MARK SIGN
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
65533, # 0x9D UNDEFINED
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
]
# ENTITIES maps character-entity names to their replacement text.
#
# Keys are entity names without the leading '&'. Every name is listed with
# its terminating ';'; some names are additionally listed without the ';'
# (e.g. 'amp' and 'amp;'). Values are one-character strings: plain ASCII
# literals for characters below 128, otherwise the character's UTF-8 byte
# sequence written with \x escapes.
#
# The table is frozen so that this shared constant cannot be modified by
# accident at runtime; look-ups and iteration are unaffected.
#
# ENTITIES was generated from Python using the following code:
#
# import constants
# entities = constants.entities.items()
# entities.sort()
# list = [ ' '.join([repr(entity), '=>', ord(value)<128 and
# repr(str(value)) or repr(value.encode('utf-8')).replace("'",'"')])
# for entity, value in entities]
# print ' ENTITIES = {\n ' + ',\n '.join(list) + '\n }'
ENTITIES = {
  'AElig' => "\xc3\x86",
  'AElig;' => "\xc3\x86",
  'AMP' => '&',
  'AMP;' => '&',
  'Aacute' => "\xc3\x81",
  'Aacute;' => "\xc3\x81",
  'Acirc' => "\xc3\x82",
  'Acirc;' => "\xc3\x82",
  'Agrave' => "\xc3\x80",
  'Agrave;' => "\xc3\x80",
  'Alpha;' => "\xce\x91",
  'Aring' => "\xc3\x85",
  'Aring;' => "\xc3\x85",
  'Atilde' => "\xc3\x83",
  'Atilde;' => "\xc3\x83",
  'Auml' => "\xc3\x84",
  'Auml;' => "\xc3\x84",
  'Beta;' => "\xce\x92",
  'COPY' => "\xc2\xa9",
  'COPY;' => "\xc2\xa9",
  'Ccedil' => "\xc3\x87",
  'Ccedil;' => "\xc3\x87",
  'Chi;' => "\xce\xa7",
  'Dagger;' => "\xe2\x80\xa1",
  'Delta;' => "\xce\x94",
  'ETH' => "\xc3\x90",
  'ETH;' => "\xc3\x90",
  'Eacute' => "\xc3\x89",
  'Eacute;' => "\xc3\x89",
  'Ecirc' => "\xc3\x8a",
  'Ecirc;' => "\xc3\x8a",
  'Egrave' => "\xc3\x88",
  'Egrave;' => "\xc3\x88",
  'Epsilon;' => "\xce\x95",
  'Eta;' => "\xce\x97",
  'Euml' => "\xc3\x8b",
  'Euml;' => "\xc3\x8b",
  'GT' => '>',
  'GT;' => '>',
  'Gamma;' => "\xce\x93",
  'Iacute' => "\xc3\x8d",
  'Iacute;' => "\xc3\x8d",
  'Icirc' => "\xc3\x8e",
  'Icirc;' => "\xc3\x8e",
  'Igrave' => "\xc3\x8c",
  'Igrave;' => "\xc3\x8c",
  'Iota;' => "\xce\x99",
  'Iuml' => "\xc3\x8f",
  'Iuml;' => "\xc3\x8f",
  'Kappa;' => "\xce\x9a",
  'LT' => '<',
  'LT;' => '<',
  'Lambda;' => "\xce\x9b",
  'Mu;' => "\xce\x9c",
  'Ntilde' => "\xc3\x91",
  'Ntilde;' => "\xc3\x91",
  'Nu;' => "\xce\x9d",
  'OElig;' => "\xc5\x92",
  'Oacute' => "\xc3\x93",
  'Oacute;' => "\xc3\x93",
  'Ocirc' => "\xc3\x94",
  'Ocirc;' => "\xc3\x94",
  'Ograve' => "\xc3\x92",
  'Ograve;' => "\xc3\x92",
  'Omega;' => "\xce\xa9",
  'Omicron;' => "\xce\x9f",
  'Oslash' => "\xc3\x98",
  'Oslash;' => "\xc3\x98",
  'Otilde' => "\xc3\x95",
  'Otilde;' => "\xc3\x95",
  'Ouml' => "\xc3\x96",
  'Ouml;' => "\xc3\x96",
  'Phi;' => "\xce\xa6",
  'Pi;' => "\xce\xa0",
  'Prime;' => "\xe2\x80\xb3",
  'Psi;' => "\xce\xa8",
  'QUOT' => '"',
  'QUOT;' => '"',
  'REG' => "\xc2\xae",
  'REG;' => "\xc2\xae",
  'Rho;' => "\xce\xa1",
  'Scaron;' => "\xc5\xa0",
  'Sigma;' => "\xce\xa3",
  'THORN' => "\xc3\x9e",
  'THORN;' => "\xc3\x9e",
  'TRADE;' => "\xe2\x84\xa2",
  'Tau;' => "\xce\xa4",
  'Theta;' => "\xce\x98",
  'Uacute' => "\xc3\x9a",
  'Uacute;' => "\xc3\x9a",
  'Ucirc' => "\xc3\x9b",
  'Ucirc;' => "\xc3\x9b",
  'Ugrave' => "\xc3\x99",
  'Ugrave;' => "\xc3\x99",
  'Upsilon;' => "\xce\xa5",
  'Uuml' => "\xc3\x9c",
  'Uuml;' => "\xc3\x9c",
  'Xi;' => "\xce\x9e",
  'Yacute' => "\xc3\x9d",
  'Yacute;' => "\xc3\x9d",
  'Yuml;' => "\xc5\xb8",
  'Zeta;' => "\xce\x96",
  'aacute' => "\xc3\xa1",
  'aacute;' => "\xc3\xa1",
  'acirc' => "\xc3\xa2",
  'acirc;' => "\xc3\xa2",
  'acute' => "\xc2\xb4",
  'acute;' => "\xc2\xb4",
  'aelig' => "\xc3\xa6",
  'aelig;' => "\xc3\xa6",
  'agrave' => "\xc3\xa0",
  'agrave;' => "\xc3\xa0",
  'alefsym;' => "\xe2\x84\xb5",
  'alpha;' => "\xce\xb1",
  'amp' => '&',
  'amp;' => '&',
  'and;' => "\xe2\x88\xa7",
  'ang;' => "\xe2\x88\xa0",
  'apos;' => "'",
  'aring' => "\xc3\xa5",
  'aring;' => "\xc3\xa5",
  'asymp;' => "\xe2\x89\x88",
  'atilde' => "\xc3\xa3",
  'atilde;' => "\xc3\xa3",
  'auml' => "\xc3\xa4",
  'auml;' => "\xc3\xa4",
  'bdquo;' => "\xe2\x80\x9e",
  'beta;' => "\xce\xb2",
  'brvbar' => "\xc2\xa6",
  'brvbar;' => "\xc2\xa6",
  'bull;' => "\xe2\x80\xa2",
  'cap;' => "\xe2\x88\xa9",
  'ccedil' => "\xc3\xa7",
  'ccedil;' => "\xc3\xa7",
  'cedil' => "\xc2\xb8",
  'cedil;' => "\xc2\xb8",
  'cent' => "\xc2\xa2",
  'cent;' => "\xc2\xa2",
  'chi;' => "\xcf\x87",
  'circ;' => "\xcb\x86",
  'clubs;' => "\xe2\x99\xa3",
  'cong;' => "\xe2\x89\x85",
  'copy' => "\xc2\xa9",
  'copy;' => "\xc2\xa9",
  'crarr;' => "\xe2\x86\xb5",
  'cup;' => "\xe2\x88\xaa",
  'curren' => "\xc2\xa4",
  'curren;' => "\xc2\xa4",
  'dArr;' => "\xe2\x87\x93",
  'dagger;' => "\xe2\x80\xa0",
  'darr;' => "\xe2\x86\x93",
  'deg' => "\xc2\xb0",
  'deg;' => "\xc2\xb0",
  'delta;' => "\xce\xb4",
  'diams;' => "\xe2\x99\xa6",
  'divide' => "\xc3\xb7",
  'divide;' => "\xc3\xb7",
  'eacute' => "\xc3\xa9",
  'eacute;' => "\xc3\xa9",
  'ecirc' => "\xc3\xaa",
  'ecirc;' => "\xc3\xaa",
  'egrave' => "\xc3\xa8",
  'egrave;' => "\xc3\xa8",
  'empty;' => "\xe2\x88\x85",
  'emsp;' => "\xe2\x80\x83",
  'ensp;' => "\xe2\x80\x82",
  'epsilon;' => "\xce\xb5",
  'equiv;' => "\xe2\x89\xa1",
  'eta;' => "\xce\xb7",
  'eth' => "\xc3\xb0",
  'eth;' => "\xc3\xb0",
  'euml' => "\xc3\xab",
  'euml;' => "\xc3\xab",
  'euro;' => "\xe2\x82\xac",
  'exist;' => "\xe2\x88\x83",
  'fnof;' => "\xc6\x92",
  'forall;' => "\xe2\x88\x80",
  'frac12' => "\xc2\xbd",
  'frac12;' => "\xc2\xbd",
  'frac14' => "\xc2\xbc",
  'frac14;' => "\xc2\xbc",
  'frac34' => "\xc2\xbe",
  'frac34;' => "\xc2\xbe",
  'frasl;' => "\xe2\x81\x84",
  'gamma;' => "\xce\xb3",
  'ge;' => "\xe2\x89\xa5",
  'gt' => '>',
  'gt;' => '>',
  'hArr;' => "\xe2\x87\x94",
  'harr;' => "\xe2\x86\x94",
  'hearts;' => "\xe2\x99\xa5",
  'hellip;' => "\xe2\x80\xa6",
  'iacute' => "\xc3\xad",
  'iacute;' => "\xc3\xad",
  'icirc' => "\xc3\xae",
  'icirc;' => "\xc3\xae",
  'iexcl' => "\xc2\xa1",
  'iexcl;' => "\xc2\xa1",
  'igrave' => "\xc3\xac",
  'igrave;' => "\xc3\xac",
  'image;' => "\xe2\x84\x91",
  'infin;' => "\xe2\x88\x9e",
  'int;' => "\xe2\x88\xab",
  'iota;' => "\xce\xb9",
  'iquest' => "\xc2\xbf",
  'iquest;' => "\xc2\xbf",
  'isin;' => "\xe2\x88\x88",
  'iuml' => "\xc3\xaf",
  'iuml;' => "\xc3\xaf",
  'kappa;' => "\xce\xba",
  'lArr;' => "\xe2\x87\x90",
  'lambda;' => "\xce\xbb",
  'lang;' => "\xe3\x80\x88",
  'laquo' => "\xc2\xab",
  'laquo;' => "\xc2\xab",
  'larr;' => "\xe2\x86\x90",
  'lceil;' => "\xe2\x8c\x88",
  'ldquo;' => "\xe2\x80\x9c",
  'le;' => "\xe2\x89\xa4",
  'lfloor;' => "\xe2\x8c\x8a",
  'lowast;' => "\xe2\x88\x97",
  'loz;' => "\xe2\x97\x8a",
  'lrm;' => "\xe2\x80\x8e",
  'lsaquo;' => "\xe2\x80\xb9",
  'lsquo;' => "\xe2\x80\x98",
  'lt' => '<',
  'lt;' => '<',
  'macr' => "\xc2\xaf",
  'macr;' => "\xc2\xaf",
  'mdash;' => "\xe2\x80\x94",
  'micro' => "\xc2\xb5",
  'micro;' => "\xc2\xb5",
  'middot' => "\xc2\xb7",
  'middot;' => "\xc2\xb7",
  'minus;' => "\xe2\x88\x92",
  'mu;' => "\xce\xbc",
  'nabla;' => "\xe2\x88\x87",
  'nbsp' => "\xc2\xa0",
  'nbsp;' => "\xc2\xa0",
  'ndash;' => "\xe2\x80\x93",
  'ne;' => "\xe2\x89\xa0",
  'ni;' => "\xe2\x88\x8b",
  'not' => "\xc2\xac",
  'not;' => "\xc2\xac",
  'notin;' => "\xe2\x88\x89",
  'nsub;' => "\xe2\x8a\x84",
  'ntilde' => "\xc3\xb1",
  'ntilde;' => "\xc3\xb1",
  'nu;' => "\xce\xbd",
  'oacute' => "\xc3\xb3",
  'oacute;' => "\xc3\xb3",
  'ocirc' => "\xc3\xb4",
  'ocirc;' => "\xc3\xb4",
  'oelig;' => "\xc5\x93",
  'ograve' => "\xc3\xb2",
  'ograve;' => "\xc3\xb2",
  'oline;' => "\xe2\x80\xbe",
  'omega;' => "\xcf\x89",
  'omicron;' => "\xce\xbf",
  'oplus;' => "\xe2\x8a\x95",
  'or;' => "\xe2\x88\xa8",
  'ordf' => "\xc2\xaa",
  'ordf;' => "\xc2\xaa",
  'ordm' => "\xc2\xba",
  'ordm;' => "\xc2\xba",
  'oslash' => "\xc3\xb8",
  'oslash;' => "\xc3\xb8",
  'otilde' => "\xc3\xb5",
  'otilde;' => "\xc3\xb5",
  'otimes;' => "\xe2\x8a\x97",
  'ouml' => "\xc3\xb6",
  'ouml;' => "\xc3\xb6",
  'para' => "\xc2\xb6",
  'para;' => "\xc2\xb6",
  'part;' => "\xe2\x88\x82",
  'permil;' => "\xe2\x80\xb0",
  'perp;' => "\xe2\x8a\xa5",
  'phi;' => "\xcf\x86",
  'pi;' => "\xcf\x80",
  'piv;' => "\xcf\x96",
  'plusmn' => "\xc2\xb1",
  'plusmn;' => "\xc2\xb1",
  'pound' => "\xc2\xa3",
  'pound;' => "\xc2\xa3",
  'prime;' => "\xe2\x80\xb2",
  'prod;' => "\xe2\x88\x8f",
  'prop;' => "\xe2\x88\x9d",
  'psi;' => "\xcf\x88",
  'quot' => '"',
  'quot;' => '"',
  'rArr;' => "\xe2\x87\x92",
  'radic;' => "\xe2\x88\x9a",
  'rang;' => "\xe3\x80\x89",
  'raquo' => "\xc2\xbb",
  'raquo;' => "\xc2\xbb",
  'rarr;' => "\xe2\x86\x92",
  'rceil;' => "\xe2\x8c\x89",
  'rdquo;' => "\xe2\x80\x9d",
  'real;' => "\xe2\x84\x9c",
  'reg' => "\xc2\xae",
  'reg;' => "\xc2\xae",
  'rfloor;' => "\xe2\x8c\x8b",
  'rho;' => "\xcf\x81",
  'rlm;' => "\xe2\x80\x8f",
  'rsaquo;' => "\xe2\x80\xba",
  'rsquo;' => "\xe2\x80\x99",
  'sbquo;' => "\xe2\x80\x9a",
  'scaron;' => "\xc5\xa1",
  'sdot;' => "\xe2\x8b\x85",
  'sect' => "\xc2\xa7",
  'sect;' => "\xc2\xa7",
  'shy' => "\xc2\xad",
  'shy;' => "\xc2\xad",
  'sigma;' => "\xcf\x83",
  'sigmaf;' => "\xcf\x82",
  'sim;' => "\xe2\x88\xbc",
  'spades;' => "\xe2\x99\xa0",
  'sub;' => "\xe2\x8a\x82",
  'sube;' => "\xe2\x8a\x86",
  'sum;' => "\xe2\x88\x91",
  'sup1' => "\xc2\xb9",
  'sup1;' => "\xc2\xb9",
  'sup2' => "\xc2\xb2",
  'sup2;' => "\xc2\xb2",
  'sup3' => "\xc2\xb3",
  'sup3;' => "\xc2\xb3",
  'sup;' => "\xe2\x8a\x83",
  'supe;' => "\xe2\x8a\x87",
  'szlig' => "\xc3\x9f",
  'szlig;' => "\xc3\x9f",
  'tau;' => "\xcf\x84",
  'there4;' => "\xe2\x88\xb4",
  'theta;' => "\xce\xb8",
  'thetasym;' => "\xcf\x91",
  'thinsp;' => "\xe2\x80\x89",
  'thorn' => "\xc3\xbe",
  'thorn;' => "\xc3\xbe",
  'tilde;' => "\xcb\x9c",
  'times' => "\xc3\x97",
  'times;' => "\xc3\x97",
  'trade;' => "\xe2\x84\xa2",
  'uArr;' => "\xe2\x87\x91",
  'uacute' => "\xc3\xba",
  'uacute;' => "\xc3\xba",
  'uarr;' => "\xe2\x86\x91",
  'ucirc' => "\xc3\xbb",
  'ucirc;' => "\xc3\xbb",
  'ugrave' => "\xc3\xb9",
  'ugrave;' => "\xc3\xb9",
  'uml' => "\xc2\xa8",
  'uml;' => "\xc2\xa8",
  'upsih;' => "\xcf\x92",
  'upsilon;' => "\xcf\x85",
  'uuml' => "\xc3\xbc",
  'uuml;' => "\xc3\xbc",
  'weierp;' => "\xe2\x84\x98",
  'xi;' => "\xce\xbe",
  'yacute' => "\xc3\xbd",
  'yacute;' => "\xc3\xbd",
  'yen' => "\xc2\xa5",
  'yen;' => "\xc2\xa5",
  'yuml' => "\xc3\xbf",
  'yuml;' => "\xc3\xbf",
  'zeta;' => "\xce\xb6",
  'zwj;' => "\xe2\x80\x8d",
  'zwnj;' => "\xe2\x80\x8c"
}.freeze
# ENCODINGS lists character-encoding names and aliases, all in lower case.
# Because the list is a %w[] literal every entry is a String, including the
# purely numeric code-page aliases such as 437 or 850.
#
# NOTE(review): presumably callers test a lower-cased encoding label against
# this list -- confirm against the code that reads it.
#
# The array is frozen so that this shared constant cannot be modified by
# accident at runtime; membership tests and iteration are unaffected.
ENCODINGS = %w[
  ansi_x3.4-1968
  iso-ir-6
  ansi_x3.4-1986
  iso_646.irv:1991
  ascii
  iso646-us
  us-ascii
  us
  ibm367
  cp367
  csascii
  ks_c_5601-1987
  korean
  iso-2022-kr
  csiso2022kr
  euc-kr
  iso-2022-jp
  csiso2022jp
  iso-2022-jp-2
  iso-ir-58
  chinese
  csiso58gb231280
  iso_8859-1:1987
  iso-ir-100
  iso_8859-1
  iso-8859-1
  latin1
  l1
  ibm819
  cp819
  csisolatin1
  iso_8859-2:1987
  iso-ir-101
  iso_8859-2
  iso-8859-2
  latin2
  l2
  csisolatin2
  iso_8859-3:1988
  iso-ir-109
  iso_8859-3
  iso-8859-3
  latin3
  l3
  csisolatin3
  iso_8859-4:1988
  iso-ir-110
  iso_8859-4
  iso-8859-4
  latin4
  l4
  csisolatin4
  iso_8859-6:1987
  iso-ir-127
  iso_8859-6
  iso-8859-6
  ecma-114
  asmo-708
  arabic
  csisolatinarabic
  iso_8859-7:1987
  iso-ir-126
  iso_8859-7
  iso-8859-7
  elot_928
  ecma-118
  greek
  greek8
  csisolatingreek
  iso_8859-8:1988
  iso-ir-138
  iso_8859-8
  iso-8859-8
  hebrew
  csisolatinhebrew
  iso_8859-5:1988
  iso-ir-144
  iso_8859-5
  iso-8859-5
  cyrillic
  csisolatincyrillic
  iso_8859-9:1989
  iso-ir-148
  iso_8859-9
  iso-8859-9
  latin5
  l5
  csisolatin5
  iso-8859-10
  iso-ir-157
  l6
  iso_8859-10:1992
  csisolatin6
  latin6
  hp-roman8
  roman8
  r8
  ibm037
  cp037
  csibm037
  ibm424
  cp424
  csibm424
  ibm437
  cp437
  437
  cspc8codepage437
  ibm500
  cp500
  csibm500
  ibm775
  cp775
  cspc775baltic
  ibm850
  cp850
  850
  cspc850multilingual
  ibm852
  cp852
  852
  cspcp852
  ibm855
  cp855
  855
  csibm855
  ibm857
  cp857
  857
  csibm857
  ibm860
  cp860
  860
  csibm860
  ibm861
  cp861
  861
  cp-is
  csibm861
  ibm862
  cp862
  862
  cspc862latinhebrew
  ibm863
  cp863
  863
  csibm863
  ibm864
  cp864
  csibm864
  ibm865
  cp865
  865
  csibm865
  ibm866
  cp866
  866
  csibm866
  ibm869
  cp869
  869
  cp-gr
  csibm869
  ibm1026
  cp1026
  csibm1026
  koi8-r
  cskoi8r
  koi8-u
  big5-hkscs
  ptcp154
  csptcp154
  pt154
  cp154
  utf-7
  utf-16be
  utf-16le
  utf-16
  utf-8
  iso-8859-13
  iso-8859-14
  iso-ir-199
  iso_8859-14:1998
  iso_8859-14
  latin8
  iso-celtic
  l8
  iso-8859-15
  iso_8859-15
  iso-8859-16
  iso-ir-226
  iso_8859-16:2001
  iso_8859-16
  latin10
  l10
  gbk
  cp936
  ms936
  gb18030
  shift_jis
  ms_kanji
  csshiftjis
  euc-jp
  gb2312
  big5
  csbig5
  windows-1250
  windows-1251
  windows-1252
  windows-1253
  windows-1254
  windows-1255
  windows-1256
  windows-1257
  windows-1258
  tis-620
  hz-gb-2312
].freeze
end

View file

@ -0,0 +1 @@
require 'html5/filters/optionaltags'

View file

@ -1,7 +1,7 @@
require 'delegate' require 'delegate'
require 'enumerator' require 'enumerator'
module HTML5lib module HTML5
module Filters module Filters
class Base < SimpleDelegator class Base < SimpleDelegator
include Enumerable include Enumerable

View file

@ -1,6 +1,6 @@
require 'html5lib/filters/base' require 'html5/filters/base'
module HTML5lib module HTML5
module Filters module Filters
class InjectMetaCharset < Base class InjectMetaCharset < Base
def initialize(source, encoding) def initialize(source, encoding)
@ -21,9 +21,9 @@ module HTML5lib
when :EmptyTag when :EmptyTag
if token[:name].downcase == "meta" if token[:name].downcase == "meta"
# replace charset with actual encoding # replace charset with actual encoding
token[:data].each_with_index do |(name,value),index| token[:data].each_with_index do |(name, value), index|
if name == 'charset' if name == 'charset'
token[:data][index][1]=@encoding token[:data][index][1] = @encoding
meta_found = true meta_found = true
end end
end end
@ -31,7 +31,7 @@ module HTML5lib
# replace charset with actual encoding # replace charset with actual encoding
has_http_equiv_content_type = false has_http_equiv_content_type = false
content_index = -1 content_index = -1
token[:data].each_with_index do |(name,value),i| token[:data].each_with_index do |(name, value), i|
if name.downcase == 'charset' if name.downcase == 'charset'
token[:data][i] = ['charset', @encoding] token[:data][i] = ['charset', @encoding]
meta_found = true meta_found = true
@ -43,30 +43,27 @@ module HTML5lib
end end
end end
if not meta_found if !meta_found
if has_http_equiv_content_type and content_index >= 0 if has_http_equiv_content_type && content_index >= 0
token[:data][content_index][1] = token[:data][content_index][1] = 'text/html; charset=%s' % @encoding
'text/html; charset=%s' % @encoding
meta_found = true meta_found = true
end end
end end
elsif token[:name].downcase == "head" and not meta_found elsif token[:name].downcase == "head" && !meta_found
# insert meta into empty head # insert meta into empty head
yield(:type => :StartTag, :name => "head", :data => token[:data]) yield :type => :StartTag, :name => "head", :data => token[:data]
yield(:type => :EmptyTag, :name => "meta", yield :type => :EmptyTag, :name => "meta", :data => [["charset", @encoding]]
:data => [["charset", @encoding]]) yield :type => :EndTag, :name => "head"
yield(:type => :EndTag, :name => "head")
meta_found = true meta_found = true
next next
end end
when :EndTag when :EndTag
if token[:name].downcase == "head" and pending.any? if token[:name].downcase == "head" && pending.any?
# insert meta into head (if necessary) and flush pending queue # insert meta into head (if necessary) and flush pending queue
yield pending.shift yield pending.shift
yield(:type => :EmptyTag, :name => "meta", yield :type => :EmptyTag, :name => "meta", :data => [["charset", @encoding]] if !meta_found
:data => [["charset", @encoding]]) if not meta_found
yield pending.shift while pending.any? yield pending.shift while pending.any?
meta_found = true meta_found = true
state = :post_head state = :post_head

View file

@ -1,7 +1,7 @@
require 'html5lib/constants' require 'html5/constants'
require 'html5lib/filters/base' require 'html5/filters/base'
module HTML5lib module HTML5
module Filters module Filters
class OptionalTagFilter < Base class OptionalTagFilter < Base
@ -75,8 +75,7 @@ module HTML5lib
if type == :StartTag if type == :StartTag
# omit the thead and tfoot elements' end tag when they are # omit the thead and tfoot elements' end tag when they are
# immediately followed by a tbody element. See is_optional_end. # immediately followed by a tbody element. See is_optional_end.
if previous and previous[:type] == :EndTag and \ if previous and previous[:type] == :EndTag && %w(tbody thead tfoot).include?(previous[:name])
%w(tbody thead tfoot).include?(previous[:name])
return false return false
end end

View file

@ -1,7 +1,7 @@
require 'html5lib/filters/base' require 'html5/filters/base'
require 'html5lib/sanitizer' require 'html5/sanitizer'
module HTML5lib module HTML5
module Filters module Filters
class HTMLSanitizeFilter < Base class HTMLSanitizeFilter < Base
include HTMLSanitizeModule include HTMLSanitizeModule

View file

@ -1,7 +1,7 @@
require 'html5lib/constants' require 'html5/constants'
require 'html5lib/filters/base' require 'html5/filters/base'
module HTML5lib module HTML5
module Filters module Filters
class WhitespaceFilter < Base class WhitespaceFilter < Base
@ -21,7 +21,7 @@ module HTML5lib
preserve -= 1 if preserve > 0 preserve -= 1 if preserve > 0
when :SpaceCharacters when :SpaceCharacters
next if preserve == 0 token[:data] = " " if preserve == 0 && token[:data]
when :Characters when :Characters
token[:data] = token[:data].sub(SPACES,' ') if preserve == 0 token[:data] = token[:data].sub(SPACES,' ') if preserve == 0

View file

@ -1,12 +1,12 @@
require 'html5lib/constants' require 'html5/constants'
require 'html5lib/tokenizer' require 'html5/tokenizer'
require 'html5lib/treebuilders/rexml' require 'html5/treebuilders/rexml'
Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path| Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
require 'html5lib/html5parser/' + File.basename(path) require 'html5/html5parser/' + File.basename(path)
end end
module HTML5lib module HTML5
# Error in parsed document # Error in parsed document
class ParseError < Exception; end class ParseError < Exception; end
@ -16,7 +16,7 @@ module HTML5lib
# #
class HTMLParser class HTMLParser
attr_accessor :phase, :firstStartTag, :innerHTML, :lastPhase, :insertFromTable attr_accessor :phase, :first_start_tag, :inner_html, :last_phase, :insert_from_table
attr_reader :phases, :tokenizer, :tree, :errors attr_reader :phases, :tokenizer, :tree, :errors
@ -25,10 +25,10 @@ module HTML5lib
new(options).parse(stream,encoding) new(options).parse(stream,encoding)
end end
def self.parseFragment(stream, options = {}) def self.parse_fragment(stream, options = {})
container = options.delete(:container) || 'div' container = options.delete(:container) || 'div'
encoding = options.delete(:encoding) encoding = options.delete(:encoding)
new(options).parseFragment(stream,container,encoding) new(options).parse_fragment(stream, container, encoding)
end end
@@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption @@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption
@ -37,7 +37,7 @@ module HTML5lib
# :strict - raise an exception when a parse error is encountered # :strict - raise an exception when a parse error is encountered
# :tree - a treebuilder class controlling the type of tree that will be # :tree - a treebuilder class controlling the type of tree that will be
# returned. Built in treebuilders can be accessed through # returned. Built in treebuilders can be accessed through
# HTML5lib::TreeBuilders[treeType] # HTML5::TreeBuilders[treeType]
def initialize(options = {}) def initialize(options = {})
@strict = false @strict = false
@errors = [] @errors = []
@ -45,55 +45,57 @@ module HTML5lib
@tokenizer = HTMLTokenizer @tokenizer = HTMLTokenizer
@tree = TreeBuilders::REXML::TreeBuilder @tree = TreeBuilders::REXML::TreeBuilder
options.each { |name, value| instance_variable_set("@#{name}", value) } options.each {|name, value| instance_variable_set("@#{name}", value) }
@lowercase_attr_name = nil unless instance_variables.include?("@lowercase_attr_name")
@lowercase_element_name = nil unless instance_variables.include?("@lowercase_element_name")
@tree = @tree.new @tree = @tree.new
@phases = @@phases.inject({}) do |phases, phase_name| @phases = @@phases.inject({}) do |phases, phase_name|
phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase' phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
phases[phase_name.to_sym] = HTML5lib.const_get(phase_class_name).new(self, @tree) phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
phases phases
end end
end end
def _parse(stream, innerHTML, encoding, container = 'div') def _parse(stream, inner_html, encoding, container = 'div')
@tree.reset @tree.reset
@firstStartTag = false @first_start_tag = false
@errors = [] @errors = []
@tokenizer = @tokenizer.class unless Class === @tokenizer @tokenizer = @tokenizer.class unless Class === @tokenizer
@tokenizer = @tokenizer.new(stream, :encoding => encoding, @tokenizer = @tokenizer.new(stream, :encoding => encoding,
:parseMeta => !innerHTML) :parseMeta => !inner_html, :lowercase_attr_name => @lowercase_attr_name, :lowercase_element_name => @lowercase_element_name)
if innerHTML if inner_html
case @innerHTML = container.downcase case @inner_html = container.downcase
when 'title', 'textarea' when 'title', 'textarea'
@tokenizer.contentModelFlag = :RCDATA @tokenizer.content_model_flag = :RCDATA
when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript' when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
@tokenizer.contentModelFlag = :CDATA @tokenizer.content_model_flag = :CDATA
when 'plaintext' when 'plaintext'
@tokenizer.contentModelFlag = :PLAINTEXT @tokenizer.content_model_flag = :PLAINTEXT
else else
# contentModelFlag already is PCDATA # content_model_flag already is PCDATA
#@tokenizer.contentModelFlag = :PCDATA #@tokenizer.content_model_flag = :PCDATA
end end
@phase = @phases[:rootElement] @phase = @phases[:rootElement]
@phase.insertHtmlElement @phase.insert_html_element
resetInsertionMode reset_insertion_mode
else else
@innerHTML = false @inner_html = false
@phase = @phases[:initial] @phase = @phases[:initial]
end end
# We only seem to have InBodyPhase testcases where the following is # We only seem to have InBodyPhase testcases where the following is
# relevant ... need others too # relevant ... need others too
@lastPhase = nil @last_phase = nil
# XXX This is temporary for the moment so there isn't any other # XXX This is temporary for the moment so there isn't any other
# changes needed for the parser to work with the iterable tokenizer # changes needed for the parser to work with the iterable tokenizer
@tokenizer.each do |token| @tokenizer.each do |token|
token = normalizeToken(token) token = normalize_token(token)
method = 'process%s' % token[:type] method = 'process%s' % token[:type]
@ -108,12 +110,12 @@ module HTML5lib
@phase.send method, token[:name], token[:publicId], @phase.send method, token[:name], token[:publicId],
token[:systemId], token[:correct] token[:systemId], token[:correct]
else else
parseError(token[:data]) parse_error(token[:data])
end end
end end
# When the loop finishes it's EOF # When the loop finishes it's EOF
@phase.processEOF @phase.process_eof
end end
# Parse a HTML document into a well-formed tree # Parse a HTML document into a well-formed tree
@ -126,12 +128,12 @@ module HTML5lib
# element) # element)
def parse(stream, encoding=nil) def parse(stream, encoding=nil)
_parse(stream, false, encoding) _parse(stream, false, encoding)
return @tree.getDocument @tree.get_document
end end
# Parse a HTML fragment into a well-formed tree fragment # Parse a HTML fragment into a well-formed tree fragment
# container - name of the element we're setting the innerHTML property # container - name of the element we're setting the inner_html property
# if set to nil, default to 'div' # if set to nil, default to 'div'
# #
# stream - a filelike object or string containing the HTML to be parsed # stream - a filelike object or string containing the HTML to be parsed
@ -140,19 +142,19 @@ module HTML5lib
# the encoding. If specified, that encoding will be used, # the encoding. If specified, that encoding will be used,
# regardless of any BOM or later declaration (such as in a meta # regardless of any BOM or later declaration (such as in a meta
# element) # element)
def parseFragment(stream, container='div', encoding=nil) def parse_fragment(stream, container='div', encoding=nil)
_parse(stream, true, encoding, container) _parse(stream, true, encoding, container)
return @tree.getFragment @tree.get_fragment
end end
def parseError(data = 'XXX ERROR MESSAGE NEEDED') def parse_error(data = 'XXX ERROR MESSAGE NEEDED')
# XXX The idea is to make data mandatory. # XXX The idea is to make data mandatory.
@errors.push([@tokenizer.stream.position, data]) @errors.push([@tokenizer.stream.position, data])
raise ParseError if @strict raise ParseError if @strict
end end
# HTML5 specific normalizations to the token stream # HTML5 specific normalizations to the token stream
def normalizeToken(token) def normalize_token(token)
if token[:type] == :EmptyTag if token[:type] == :EmptyTag
# When a solidus (/) is encountered within a tag name what happens # When a solidus (/) is encountered within a tag name what happens
@ -161,29 +163,29 @@ module HTML5lib
# thing and if it doesn't it's wrong for everyone. # thing and if it doesn't it's wrong for everyone.
unless VOID_ELEMENTS.include?(token[:name]) unless VOID_ELEMENTS.include?(token[:name])
parseError(_('Solidus (/) incorrectly placed in tag.')) parse_error(_('Solidus (/) incorrectly placed in tag.'))
end end
token[:type] = :StartTag token[:type] = :StartTag
end end
if token[:type] == :StartTag if token[:type] == :StartTag
token[:name] = token[:name].tr(ASCII_UPPERCASE,ASCII_LOWERCASE) token[:name] = token[:name].downcase
# We need to remove the duplicate attributes and convert attributes # We need to remove the duplicate attributes and convert attributes
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"} # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
unless token[:data].empty? unless token[:data].empty?
data = token[:data].reverse.map { |attr, value| [attr.tr(ASCII_UPPERCASE, ASCII_LOWERCASE), value] } data = token[:data].reverse.map {|attr, value| [attr.downcase, value] }
token[:data] = Hash[*data.flatten] token[:data] = Hash[*data.flatten]
end end
elsif token[:type] == :EndTag elsif token[:type] == :EndTag
parseError(_('End tag contains unexpected attributes.')) unless token[:data].empty? parse_error(_('End tag contains unexpected attributes.')) unless token[:data].empty?
token[:name] = token[:name].downcase token[:name] = token[:name].downcase
end end
return token token
end end
@@new_modes = { @@new_modes = {
@ -202,34 +204,34 @@ module HTML5lib
'frameset' => :inFrameset 'frameset' => :inFrameset
} }
def resetInsertionMode def reset_insertion_mode
# The name of this method is mostly historical. (It's also used in the # The name of this method is mostly historical. (It's also used in the
# specification.) # specification.)
last = false last = false
@tree.openElements.reverse.each do |node| @tree.open_elements.reverse.each do |node|
nodeName = node.name node_name = node.name
if node == @tree.openElements[0] if node == @tree.open_elements.first
last = true last = true
unless ['td', 'th'].include?(nodeName) unless ['td', 'th'].include?(node_name)
# XXX # XXX
# assert @innerHTML # assert @inner_html
nodeName = @innerHTML node_name = @inner_html
end end
end end
# Check for conditions that should only happen in the innerHTML # Check for conditions that should only happen in the inner_html
# case # case
if ['select', 'colgroup', 'head', 'frameset'].include?(nodeName) if ['select', 'colgroup', 'head', 'frameset'].include?(node_name)
# XXX # XXX
# assert @innerHTML # assert @inner_html
end end
if @@new_modes.has_key?(nodeName) if @@new_modes.has_key?(node_name)
@phase = @phases[@@new_modes[nodeName]] @phase = @phases[@@new_modes[node_name]]
elsif nodeName == 'html' elsif node_name == 'html'
@phase = @phases[@tree.headPointer.nil?? :beforeHead : :afterHead] @phase = @phases[@tree.head_pointer.nil?? :beforeHead : :afterHead]
elsif last elsif last
@phase = @phases[:inBody] @phase = @phases[:inBody]
else else

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class AfterBodyPhase < Phase class AfterBodyPhase < Phase
handle_end 'html' handle_end 'html'
@ -8,36 +8,36 @@ module HTML5lib
def processComment(data) def processComment(data)
# This is needed because data is to be appended to the <html> element # This is needed because data is to be appended to the <html> element
# here and not to whatever is currently open. # here and not to whatever is currently open.
@tree.insertComment(data, @tree.openElements[0]) @tree.insert_comment(data, @tree.open_elements.first)
end end
def processCharacters(data) def processCharacters(data)
@parser.parseError(_('Unexpected non-space characters in the after body phase.')) parse_error(_('Unexpected non-space characters in the after body phase.'))
@parser.phase = @parser.phases[:inBody] @parser.phase = @parser.phases[:inBody]
@parser.phase.processCharacters(data) @parser.phase.processCharacters(data)
end end
def processStartTag(name, attributes) def processStartTag(name, attributes)
@parser.parseError(_("Unexpected start tag token (#{name}) in the after body phase.")) parse_error(_("Unexpected start tag token (#{name}) in the after body phase."))
@parser.phase = @parser.phases[:inBody] @parser.phase = @parser.phases[:inBody]
@parser.phase.processStartTag(name, attributes) @parser.phase.processStartTag(name, attributes)
end end
def endTagHtml(name) def endTagHtml(name)
if @parser.innerHTML if @parser.inner_html
@parser.parseError parse_error
else else
# XXX: This may need to be done, not sure # XXX: This may need to be done, not sure
# Don't set lastPhase to the current phase but to the inBody phase # Don't set last_phase to the current phase but to the inBody phase
# instead. No need for extra parse errors if there's something after </html>. # instead. No need for extra parse errors if there's something after </html>.
# Try "<!doctype html>X</html>X" for instance. # Try "<!doctype html>X</html>X" for instance.
@parser.lastPhase = @parser.phase @parser.last_phase = @parser.phase
@parser.phase = @parser.phases[:trailingEnd] @parser.phase = @parser.phases[:trailingEnd]
end end
end end
def endTagOther(name) def endTagOther(name)
@parser.parseError(_("Unexpected end tag token (#{name}) in the after body phase.")) parse_error(_("Unexpected end tag token (#{name}) in the after body phase."))
@parser.phase = @parser.phases[:inBody] @parser.phase = @parser.phases[:inBody]
@parser.phase.processEndTag(name) @parser.phase.processEndTag(name)
end end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class AfterFramesetPhase < Phase class AfterFramesetPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#after3 # http://www.whatwg.org/specs/web-apps/current-work/#after3
@ -10,7 +10,7 @@ module HTML5lib
handle_end 'html' handle_end 'html'
def processCharacters(data) def processCharacters(data)
@parser.parseError(_('Unexpected non-space characters in the after frameset phase. Ignored.')) parse_error(_('Unexpected non-space characters in the after frameset phase. Ignored.'))
end end
def startTagNoframes(name, attributes) def startTagNoframes(name, attributes)
@ -18,16 +18,16 @@ module HTML5lib
end end
def startTagOther(name, attributes) def startTagOther(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) in the after frameset phase. Ignored.")) parse_error(_("Unexpected start tag (#{name}) in the after frameset phase. Ignored."))
end end
def endTagHtml(name) def endTagHtml(name)
@parser.lastPhase = @parser.phase @parser.last_phase = @parser.phase
@parser.phase = @parser.phases[:trailingEnd] @parser.phase = @parser.phases[:trailingEnd]
end end
def endTagOther(name) def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}) in the after frameset phase. Ignored.")) parse_error(_("Unexpected end tag (#{name}) in the after frameset phase. Ignored."))
end end
end end

View file

@ -1,48 +1,48 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class AfterHeadPhase < Phase class AfterHeadPhase < Phase
handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead' handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'
def processEOF def process_eof
anythingElse anything_else
@parser.phase.processEOF @parser.phase.process_eof
end end
def processCharacters(data) def processCharacters(data)
anythingElse anything_else
@parser.phase.processCharacters(data) @parser.phase.processCharacters(data)
end end
def startTagBody(name, attributes) def startTagBody(name, attributes)
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inBody] @parser.phase = @parser.phases[:inBody]
end end
def startTagFrameset(name, attributes) def startTagFrameset(name, attributes)
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inFrameset] @parser.phase = @parser.phases[:inFrameset]
end end
def startTagFromHead(name, attributes) def startTagFromHead(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) that can be in head. Moved.")) parse_error(_("Unexpected start tag (#{name}) that can be in head. Moved."))
@parser.phase = @parser.phases[:inHead] @parser.phase = @parser.phases[:inHead]
@parser.phase.processStartTag(name, attributes) @parser.phase.processStartTag(name, attributes)
end end
def startTagOther(name, attributes) def startTagOther(name, attributes)
anythingElse anything_else
@parser.phase.processStartTag(name, attributes) @parser.phase.processStartTag(name, attributes)
end end
def processEndTag(name) def processEndTag(name)
anythingElse anything_else
@parser.phase.processEndTag(name) @parser.phase.processEndTag(name)
end end
def anythingElse def anything_else
@tree.insertElement('body', {}) @tree.insert_element('body', {})
@parser.phase = @parser.phases[:inBody] @parser.phase = @parser.phases[:inBody]
end end

View file

@ -1,15 +1,15 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class BeforeHeadPhase < Phase class BeforeHeadPhase < Phase
handle_start 'html', 'head' handle_start 'html', 'head'
handle_end %w( html head body br ) => 'ImplyHead' handle_end %w( html head body br p ) => 'ImplyHead'
def processEOF def process_eof
startTagHead('head', {}) startTagHead('head', {})
@parser.phase.processEOF @parser.phase.process_eof
end end
def processCharacters(data) def processCharacters(data)
@ -18,8 +18,8 @@ module HTML5lib
end end
def startTagHead(name, attributes) def startTagHead(name, attributes)
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@tree.headPointer = @tree.openElements[-1] @tree.head_pointer = @tree.open_elements[-1]
@parser.phase = @parser.phases[:inHead] @parser.phase = @parser.phases[:inHead]
end end
@ -34,7 +34,7 @@ module HTML5lib
end end
def endTagOther(name) def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}) after the (implied) root element.")) parse_error(_("Unexpected end tag (#{name}) after the (implied) root element."))
end end
end end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InBodyPhase < Phase class InBodyPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-body # http://www.whatwg.org/specs/web-apps/current-work/#in-body
@ -51,25 +51,40 @@ module HTML5lib
# for special handling of whitespace in <pre> # for special handling of whitespace in <pre>
@processSpaceCharactersDropNewline = false @processSpaceCharactersDropNewline = false
if $-w
$-w = false
alias processSpaceCharactersNonPre processSpaceCharacters
$-w = true
else
alias processSpaceCharactersNonPre processSpaceCharacters
end
end end
def processSpaceCharactersDropNewline(data) def processSpaceCharactersDropNewline(data)
#Sometimes (start of <pre> blocks) we want to drop leading newlines # #Sometimes (start of <pre> blocks) we want to drop leading newlines
@processSpaceCharactersDropNewline = false
if (data.length > 0 and data[0] == ?\n and if $-w
%w[pre textarea].include?(@tree.openElements[-1].name) and $-w = false
not @tree.openElements[-1].hasContent) alias processSpaceCharacters processSpaceCharactersNonPre
$-w = true
else
alias processSpaceCharacters processSpaceCharactersNonPre
end
if (data.length > 0 and data[0] == ?\n &&
%w[pre textarea].include?(@tree.open_elements.last.name) && !@tree.open_elements.last.hasContent)
data = data[1..-1] data = data[1..-1]
end end
@tree.insertText(data) if data.length > 0
if data.length > 0
@tree.reconstructActiveFormattingElements
@tree.insertText(data)
end
end end
def processSpaceCharacters(data) def processSpaceCharacters(data)
if @processSpaceCharactersDropNewline @tree.reconstructActiveFormattingElements()
processSpaceCharactersDropNewline(data) @tree.insertText(data)
else
super(data)
end
end end
def processCharacters(data) def processCharacters(data)
@ -85,20 +100,19 @@ module HTML5lib
end end
def startTagTitle(name, attributes) def startTagTitle(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) that belongs in the head. Moved.")) parse_error(_("Unexpected start tag (#{name}) that belongs in the head. Moved."))
@parser.phases[:inHead].processStartTag(name, attributes) @parser.phases[:inHead].processStartTag(name, attributes)
end end
def startTagBody(name, attributes) def startTagBody(name, attributes)
@parser.parseError(_('Unexpected start tag (body).')) parse_error(_('Unexpected start tag (body).'))
if (@tree.openElements.length == 1 or if (@tree.open_elements.length == 1 || @tree.open_elements[1].name != 'body')
@tree.openElements[1].name != 'body') assert @parser.inner_html
assert @parser.innerHTML
else else
attributes.each do |attr, value| attributes.each do |attr, value|
unless @tree.openElements[1].attributes.has_key?(attr) unless @tree.open_elements[1].attributes.has_key?(attr)
@tree.openElements[1].attributes[attr] = value @tree.open_elements[1].attributes[attr] = value
end end
end end
end end
@ -106,17 +120,17 @@ module HTML5lib
def startTagCloseP(name, attributes) def startTagCloseP(name, attributes)
endTagP('p') if in_scope?('p') endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@processSpaceCharactersDropNewline = true if name == 'pre' @processSpaceCharactersDropNewline = true if name == 'pre'
end end
def startTagForm(name, attributes) def startTagForm(name, attributes)
if @tree.formPointer if @tree.formPointer
@parser.parseError('Unexpected start tag (form). Ignored.') parse_error(_('Unexpected start tag (form). Ignored.'))
else else
endTagP('p') if in_scope?('p') endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@tree.formPointer = @tree.openElements[-1] @tree.formPointer = @tree.open_elements[-1]
end end
end end
@ -125,31 +139,28 @@ module HTML5lib
stopNames = {'li' => ['li'], 'dd' => ['dd', 'dt'], 'dt' => ['dd', 'dt']} stopNames = {'li' => ['li'], 'dd' => ['dd', 'dt'], 'dt' => ['dd', 'dt']}
stopName = stopNames[name] stopName = stopNames[name]
@tree.openElements.reverse.each_with_index do |node, i| @tree.open_elements.reverse.each_with_index do |node, i|
if stopName.include?(node.name) if stopName.include?(node.name)
poppedNodes = (0..i).collect { @tree.openElements.pop } poppedNodes = (0..i).collect { @tree.open_elements.pop }
if i >= 1 if i >= 1
@parser.parseError("Missing end tag%s (%s)" % [ parse_error(_("Missing end tag%s (%s)" % [(i>1 ? 's' : ''), poppedNodes.reverse.map{|item| item.name}.join(', ')]))
(i>1 ? 's' : ''),
poppedNodes.reverse.map {|item| item.name}.join(', ')])
end end
break break
end end
# Phrasing elements are all non special, non scoping, non # Phrasing elements are all non special, non scoping, non
# formatting elements # formatting elements
break if ((SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name) and break if ((SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name) && !%w[address div].include?(node.name))
not ['address', 'div'].include?(node.name))
end end
# Always insert an <li> element. # Always insert an <li> element.
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
end end
def startTagPlaintext(name, attributes) def startTagPlaintext(name, attributes)
endTagP('p') if in_scope?('p') endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@parser.tokenizer.contentModelFlag = :PLAINTEXT @parser.tokenizer.content_model_flag = :PLAINTEXT
end end
def startTagHeading(name, attributes) def startTagHeading(name, attributes)
@ -158,7 +169,7 @@ module HTML5lib
# Uncomment the following for IE7 behavior: # Uncomment the following for IE7 behavior:
# HEADING_ELEMENTS.each do |element| # HEADING_ELEMENTS.each do |element|
# if in_scope?(element) # if in_scope?(element)
# @parser.parseError(_("Unexpected start tag (#{name}).")) # parse_error(_("Unexpected start tag (#{name})."))
# #
# remove_open_elements_until do |element| # remove_open_elements_until do |element|
# HEADING_ELEMENTS.include?(element.name) # HEADING_ELEMENTS.include?(element.name)
@ -167,14 +178,14 @@ module HTML5lib
# break # break
# end # end
# end # end
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
end end
def startTagA(name, attributes) def startTagA(name, attributes)
if afeAElement = @tree.elementInActiveFormattingElements('a') if afeAElement = @tree.elementInActiveFormattingElements('a')
@parser.parseError(_('Unexpected start tag (a) implies end tag (a).')) parse_error(_('Unexpected start tag (a) implies end tag (a).'))
endTagFormatting('a') endTagFormatting('a')
@tree.openElements.delete(afeAElement) if @tree.openElements.include?(afeAElement) @tree.open_elements.delete(afeAElement) if @tree.open_elements.include?(afeAElement)
@tree.activeFormattingElements.delete(afeAElement) if @tree.activeFormattingElements.include?(afeAElement) @tree.activeFormattingElements.delete(afeAElement) if @tree.activeFormattingElements.include?(afeAElement)
end end
@tree.reconstructActiveFormattingElements @tree.reconstructActiveFormattingElements
@ -188,77 +199,82 @@ module HTML5lib
def startTagNobr(name, attributes) def startTagNobr(name, attributes)
@tree.reconstructActiveFormattingElements @tree.reconstructActiveFormattingElements
processEndTag('nobr') if in_scope?('nobr') if in_scope?('nobr')
parse_error(_('Unexpected start tag (nobr) implies end tag (nobr).'))
processEndTag('nobr')
# XXX Need tests that trigger the following
@tree.reconstructActiveFormattingElements
end
addFormattingElement(name, attributes) addFormattingElement(name, attributes)
end end
def startTagButton(name, attributes) def startTagButton(name, attributes)
if in_scope?('button') if in_scope?('button')
@parser.parseError(_('Unexpected start tag (button) implied end tag (button).')) parse_error(_('Unexpected start tag (button) implied end tag (button).'))
processEndTag('button') processEndTag('button')
@parser.phase.processStartTag(name, attributes) @parser.phase.processStartTag(name, attributes)
else else
@tree.reconstructActiveFormattingElements @tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@tree.activeFormattingElements.push(Marker) @tree.activeFormattingElements.push(Marker)
end end
end end
def startTagMarqueeObject(name, attributes) def startTagMarqueeObject(name, attributes)
@tree.reconstructActiveFormattingElements @tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@tree.activeFormattingElements.push(Marker) @tree.activeFormattingElements.push(Marker)
end end
def startTagXmp(name, attributes) def startTagXmp(name, attributes)
@tree.reconstructActiveFormattingElements @tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@parser.tokenizer.contentModelFlag = :CDATA @parser.tokenizer.content_model_flag = :CDATA
end end
def startTagTable(name, attributes) def startTagTable(name, attributes)
processEndTag('p') if in_scope?('p') processEndTag('p') if in_scope?('p')
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inTable] @parser.phase = @parser.phases[:inTable]
end end
def startTagVoidFormatting(name, attributes) def startTagVoidFormatting(name, attributes)
@tree.reconstructActiveFormattingElements @tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@tree.openElements.pop @tree.open_elements.pop
end end
def startTagHr(name, attributes) def startTagHr(name, attributes)
endTagP('p') if in_scope?('p') endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@tree.openElements.pop @tree.open_elements.pop
end end
def startTagImage(name, attributes) def startTagImage(name, attributes)
# No really... # No really...
@parser.parseError(_('Unexpected start tag (image). Treated as img.')) parse_error(_('Unexpected start tag (image). Treated as img.'))
processStartTag('img', attributes) processStartTag('img', attributes)
end end
def startTagInput(name, attributes) def startTagInput(name, attributes)
@tree.reconstructActiveFormattingElements @tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
if @tree.formPointer if @tree.formPointer
# XXX Not exactly sure what to do here # XXX Not exactly sure what to do here
# @tree.openElements[-1].form = @tree.formPointer # @tree.open_elements[-1].form = @tree.formPointer
end end
@tree.openElements.pop @tree.open_elements.pop
end end
def startTagIsindex(name, attributes) def startTagIsindex(name, attributes)
@parser.parseError("Unexpected start tag isindex. Don't use it!") parse_error(_("Unexpected start tag isindex. Don't use it!"))
return if @tree.formPointer return if @tree.formPointer
processStartTag('form', {}) processStartTag('form', {})
processStartTag('hr', {}) processStartTag('hr', {})
processStartTag('p', {}) processStartTag('p', {})
processStartTag('label', {}) processStartTag('label', {})
# XXX Localization ... # XXX Localization ...
processCharacters('This is a searchable index. Insert your search keywords here:') processCharacters('This is a searchable index. Insert your search keywords here: ')
attributes['name'] = 'isindex' attributes['name'] = 'isindex'
attrs = attributes.to_a attrs = attributes.to_a
processStartTag('input', attributes) processStartTag('input', attributes)
@ -270,20 +286,21 @@ module HTML5lib
def startTagTextarea(name, attributes) def startTagTextarea(name, attributes)
# XXX Form element pointer checking here as well... # XXX Form element pointer checking here as well...
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@parser.tokenizer.contentModelFlag = :RCDATA @parser.tokenizer.content_model_flag = :RCDATA
@processSpaceCharactersDropNewline = true @processSpaceCharactersDropNewline = true
alias processSpaceCharacters processSpaceCharactersDropNewline
end end
# iframe, noembed noframes, noscript(if scripting enabled) # iframe, noembed noframes, noscript(if scripting enabled)
def startTagCdata(name, attributes) def startTagCdata(name, attributes)
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@parser.tokenizer.contentModelFlag = :CDATA @parser.tokenizer.content_model_flag = :CDATA
end end
def startTagSelect(name, attributes) def startTagSelect(name, attributes)
@tree.reconstructActiveFormattingElements @tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inSelect] @parser.phase = @parser.phases[:inSelect]
end end
@ -293,7 +310,7 @@ module HTML5lib
# "caption", "col", "colgroup", "frame", "frameset", "head", # "caption", "col", "colgroup", "frame", "frameset", "head",
# "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", # "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
# "tr", "noscript" # "tr", "noscript"
@parser.parseError(_("Unexpected start tag (#{name}). Ignored.")) parse_error(_("Unexpected start tag (#{name}). Ignored."))
end end
def startTagNew(name, attributes) def startTagNew(name, attributes)
@ -306,33 +323,38 @@ module HTML5lib
def startTagOther(name, attributes) def startTagOther(name, attributes)
@tree.reconstructActiveFormattingElements @tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
end end
def endTagP(name) def endTagP(name)
@tree.generateImpliedEndTags('p') if in_scope?('p') @tree.generateImpliedEndTags('p') if in_scope?('p')
@parser.parseError('Unexpected end tag (p).') unless @tree.openElements[-1].name == 'p' parse_error(_('Unexpected end tag (p).')) unless @tree.open_elements.last.name == 'p'
@tree.openElements.pop while in_scope?('p') if in_scope?('p')
@tree.open_elements.pop while in_scope?('p')
else
startTagCloseP('p', {})
endTagP('p')
end
end end
def endTagBody(name) def endTagBody(name)
# XXX Need to take open <p> tags into account here. We shouldn't imply # XXX Need to take open <p> tags into account here. We shouldn't imply
# </p> but we should not throw a parse error either. Specification is # </p> but we should not throw a parse error either. Specification is
# likely to be updated. # likely to be updated.
unless @tree.openElements[1].name == 'body' unless @tree.open_elements[1].name == 'body'
# innerHTML case # inner_html case
@parser.parseError parse_error
return return
end end
unless @tree.openElements[-1].name == 'body' unless @tree.open_elements.last.name == 'body'
@parser.parseError(_("Unexpected end tag (body). Missing end tag (#{@tree.openElements[-1].name}).")) parse_error(_("Unexpected end tag (body). Missing end tag (#{@tree.open_elements[-1].name})."))
end end
@parser.phase = @parser.phases[:afterBody] @parser.phase = @parser.phases[:afterBody]
end end
def endTagHtml(name) def endTagHtml(name)
endTagBody(name) endTagBody(name)
@parser.phase.processEndTag(name) unless @parser.innerHTML @parser.phase.processEndTag(name) unless @parser.inner_html
end end
def endTagBlock(name) def endTagBlock(name)
@ -341,8 +363,8 @@ module HTML5lib
@tree.generateImpliedEndTags if in_scope?(name) @tree.generateImpliedEndTags if in_scope?(name)
unless @tree.openElements[-1].name == name unless @tree.open_elements.last.name == name
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag.")) parse_error(_("End tag (#{name}) seen too early. Expected other end tag."))
end end
if in_scope?(name) if in_scope?(name)
@ -351,18 +373,23 @@ module HTML5lib
end end
def endTagForm(name) def endTagForm(name)
endTagBlock(name) if in_scope?(name)
@tree.generateImpliedEndTags
end
if @tree.open_elements.last.name != name
parse_error(_("End tag (form) seen too early. Ignored."))
else
@tree.open_elements.pop
end
@tree.formPointer = nil @tree.formPointer = nil
end end
def endTagListItem(name) def endTagListItem(name)
# AT Could merge this with the Block case # AT Could merge this with the Block case
if in_scope?(name) @tree.generateImpliedEndTags(name) if in_scope?(name)
@tree.generateImpliedEndTags(name)
unless @tree.openElements[-1].name == name unless @tree.open_elements.last.name == name
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag.")) parse_error(_("End tag (#{name}) seen too early. " + 'Expected other end tag.'))
end
end end
remove_open_elements_until(name) if in_scope?(name) remove_open_elements_until(name) if in_scope?(name)
@ -376,13 +403,13 @@ module HTML5lib
end end
end end
unless @tree.openElements[-1].name == name unless @tree.open_elements.last.name == name
@parser.parseError(("Unexpected end tag (#{name}). Expected other end tag.")) parse_error(_("Unexpected end tag (#{name}). Expected other end tag."))
end end
HEADING_ELEMENTS.each do |element| HEADING_ELEMENTS.each do |element|
if in_scope?(element) if in_scope?(element)
remove_open_elements_until { |element| HEADING_ELEMENTS.include?(element.name) } remove_open_elements_until {|element| HEADING_ELEMENTS.include?(element.name)}
break break
end end
end end
@ -391,30 +418,30 @@ module HTML5lib
# The much-feared adoption agency algorithm # The much-feared adoption agency algorithm
def endTagFormatting(name) def endTagFormatting(name)
# http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency # http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
# XXX Better parseError messages appreciated. # XXX Better parse_error messages appreciated.
while true while true
# Step 1 paragraph 1 # Step 1 paragraph 1
afeElement = @tree.elementInActiveFormattingElements(name) afeElement = @tree.elementInActiveFormattingElements(name)
if not afeElement or (@tree.openElements.include?(afeElement) and not in_scope?(afeElement.name)) if !afeElement or (@tree.open_elements.include?(afeElement) && !in_scope?(afeElement.name))
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 1 of the adoption agency algorithm.")) parse_error(_("End tag (#{name}) violates step 1, paragraph 1 of the adoption agency algorithm."))
return return
# Step 1 paragraph 2 # Step 1 paragraph 2
elsif not @tree.openElements.include?(afeElement) elsif not @tree.open_elements.include?(afeElement)
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 2 of the adoption agency algorithm.")) parse_error(_("End tag (#{name}) violates step 1, paragraph 2 of the adoption agency algorithm."))
@tree.activeFormattingElements.delete(afeElement) @tree.activeFormattingElements.delete(afeElement)
return return
end end
# Step 1 paragraph 3 # Step 1 paragraph 3
if afeElement != @tree.openElements[-1] if afeElement != @tree.open_elements.last
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 3 of the adoption agency algorithm.")) parse_error(_("End tag (#{name}) violates step 1, paragraph 3 of the adoption agency algorithm."))
end end
# Step 2 # Step 2
# Start of the adoption agency algorithm proper # Start of the adoption agency algorithm proper
afeIndex = @tree.openElements.index(afeElement) afeIndex = @tree.open_elements.index(afeElement)
furthestBlock = nil furthestBlock = nil
@tree.openElements[afeIndex..-1].each do |element| @tree.open_elements[afeIndex..-1].each do |element|
if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(element.name) if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(element.name)
furthestBlock = element furthestBlock = element
break break
@ -423,11 +450,11 @@ module HTML5lib
# Step 3 # Step 3
if furthestBlock.nil? if furthestBlock.nil?
element = remove_open_elements_until { |element| element == afeElement } element = remove_open_elements_until {|element| element == afeElement }
@tree.activeFormattingElements.delete(element) @tree.activeFormattingElements.delete(element)
return return
end end
commonAncestor = @tree.openElements[afeIndex - 1] commonAncestor = @tree.open_elements[afeIndex - 1]
# Step 5 # Step 5
furthestBlock.parent.removeChild(furthestBlock) if furthestBlock.parent furthestBlock.parent.removeChild(furthestBlock) if furthestBlock.parent
@ -444,11 +471,11 @@ module HTML5lib
while true while true
# AT replace this with a function and recursion? # AT replace this with a function and recursion?
# Node is element before node in open elements # Node is element before node in open elements
node = @tree.openElements[@tree.openElements.index(node) - 1] node = @tree.open_elements[@tree.open_elements.index(node) - 1]
until @tree.activeFormattingElements.include?(node) until @tree.activeFormattingElements.include?(node)
tmpNode = node tmpNode = node
node = @tree.openElements[@tree.openElements.index(node) - 1] node = @tree.open_elements[@tree.open_elements.index(node) - 1]
@tree.openElements.delete(tmpNode) @tree.open_elements.delete(tmpNode)
end end
# Step 7.3 # Step 7.3
break if node == afeElement break if node == afeElement
@ -465,7 +492,7 @@ module HTML5lib
clone = node.cloneNode clone = node.cloneNode
# Replace node with clone # Replace node with clone
@tree.activeFormattingElements[@tree.activeFormattingElements.index(node)] = clone @tree.activeFormattingElements[@tree.activeFormattingElements.index(node)] = clone
@tree.openElements[@tree.openElements.index(node)] = clone @tree.open_elements[@tree.open_elements.index(node)] = clone
node = clone node = clone
end end
# Step 7.6 # Step 7.6
@ -495,16 +522,16 @@ module HTML5lib
@tree.activeFormattingElements.insert([bookmark,@tree.activeFormattingElements.length].min, clone) @tree.activeFormattingElements.insert([bookmark,@tree.activeFormattingElements.length].min, clone)
# Step 13 # Step 13
@tree.openElements.delete(afeElement) @tree.open_elements.delete(afeElement)
@tree.openElements.insert(@tree.openElements.index(furthestBlock) + 1, clone) @tree.open_elements.insert(@tree.open_elements.index(furthestBlock) + 1, clone)
end end
end end
def endTagButtonMarqueeObject(name) def endTagButtonMarqueeObject(name)
@tree.generateImpliedEndTags if in_scope?(name) @tree.generateImpliedEndTags if in_scope?(name)
unless @tree.openElements[-1].name == name unless @tree.open_elements.last.name == name
@parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag first.")) parse_error(_("Unexpected end tag (#{name}). Expected other end tag first."))
end end
if in_scope?(name) if in_scope?(name)
@ -516,26 +543,26 @@ module HTML5lib
def endTagMisplaced(name) def endTagMisplaced(name)
# This handles elements with end tags in other insertion modes. # This handles elements with end tags in other insertion modes.
@parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) parse_error(_("Unexpected end tag (#{name}). Ignored."))
end end
def endTagBr(name) def endTagBr(name)
@parser.parseError(_("Unexpected end tag (br). Treated as br element.")) parse_error(_("Unexpected end tag (br). Treated as br element."))
@tree.reconstructActiveFormattingElements @tree.reconstructActiveFormattingElements
@tree.insertElement(name, {}) @tree.insert_element(name, {})
@tree.openElements.pop() @tree.open_elements.pop()
end end
def endTagNone(name) def endTagNone(name)
# This handles elements with no end tag. # This handles elements with no end tag.
@parser.parseError(_("This tag (#{name}) has no end tag")) parse_error(_("This tag (#{name}) has no end tag"))
end end
def endTagCdataTextAreaXmp(name) def endTagCdataTextAreaXmp(name)
if @tree.openElements[-1].name == name if @tree.open_elements.last.name == name
@tree.openElements.pop @tree.open_elements.pop
else else
@parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) parse_error(_("Unexpected end tag (#{name}). Ignored."))
end end
end end
@ -549,20 +576,20 @@ module HTML5lib
def endTagOther(name) def endTagOther(name)
# XXX This logic should be moved into the treebuilder # XXX This logic should be moved into the treebuilder
@tree.openElements.reverse.each do |node| @tree.open_elements.reverse.each do |node|
if node.name == name if node.name == name
@tree.generateImpliedEndTags @tree.generateImpliedEndTags
unless @tree.openElements[-1].name == name unless @tree.open_elements.last.name == name
@parser.parseError(_("Unexpected end tag (#{name}).")) parse_error(_("Unexpected end tag (#{name})."))
end end
remove_open_elements_until { |element| element == node } remove_open_elements_until {|element| element == node }
break break
else else
if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name) if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) parse_error(_("Unexpected end tag (#{name}). Ignored."))
break break
end end
end end
@ -572,8 +599,8 @@ module HTML5lib
protected protected
def addFormattingElement(name, attributes) def addFormattingElement(name, attributes)
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@tree.activeFormattingElements.push(@tree.openElements[-1]) @tree.activeFormattingElements.push(@tree.open_elements.last)
end end
end end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InCaptionPhase < Phase class InCaptionPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
@ -10,7 +10,7 @@ module HTML5lib
handle_end 'caption', 'table', %w( body col colgroup html tbody td tfoot th thead tr ) => 'Ignore' handle_end 'caption', 'table', %w( body col colgroup html tbody td tfoot th thead tr ) => 'Ignore'
def ignoreEndTagCaption def ignoreEndTagCaption
not in_scope?('caption', true) !in_scope?('caption', true)
end end
def processCharacters(data) def processCharacters(data)
@ -18,7 +18,7 @@ module HTML5lib
end end
def startTagTableElement(name, attributes) def startTagTableElement(name, attributes)
@parser.parseError parse_error
#XXX Have to duplicate logic here to find out if the tag is ignored #XXX Have to duplicate logic here to find out if the tag is ignored
ignoreEndTag = ignoreEndTagCaption ignoreEndTag = ignoreEndTagCaption
@parser.phase.processEndTag('caption') @parser.phase.processEndTag('caption')
@ -31,15 +31,15 @@ module HTML5lib
def endTagCaption(name) def endTagCaption(name)
if ignoreEndTagCaption if ignoreEndTagCaption
# innerHTML case # inner_html case
assert @parser.innerHTML assert @parser.inner_html
@parser.parseError parse_error
else else
# AT this code is quite similar to endTagTable in "InTable" # AT this code is quite similar to endTagTable in "InTable"
@tree.generateImpliedEndTags @tree.generateImpliedEndTags
unless @tree.openElements[-1].name == 'caption' unless @tree.open_elements[-1].name == 'caption'
@parser.parseError(_("Unexpected end tag (caption). Missing end tags.")) parse_error(_("Unexpected end tag (caption). Missing end tags."))
end end
remove_open_elements_until('caption') remove_open_elements_until('caption')
@ -50,14 +50,14 @@ module HTML5lib
end end
def endTagTable(name) def endTagTable(name)
@parser.parseError parse_error
ignoreEndTag = ignoreEndTagCaption ignoreEndTag = ignoreEndTagCaption
@parser.phase.processEndTag('caption') @parser.phase.processEndTag('caption')
@parser.phase.processEndTag(name) unless ignoreEndTag @parser.phase.processEndTag(name) unless ignoreEndTag
end end
def endTagIgnore(name) def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) parse_error(_("Unexpected end tag (#{name}). Ignored."))
end end
def endTagOther(name) def endTagOther(name)

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InCellPhase < Phase class InCellPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
@ -20,8 +20,8 @@ module HTML5lib
closeCell closeCell
@parser.phase.processStartTag(name, attributes) @parser.phase.processStartTag(name, attributes)
else else
# innerHTML case # inner_html case
@parser.parseError parse_error
end end
end end
@ -32,22 +32,22 @@ module HTML5lib
def endTagTableCell(name) def endTagTableCell(name)
if in_scope?(name, true) if in_scope?(name, true)
@tree.generateImpliedEndTags(name) @tree.generateImpliedEndTags(name)
if @tree.openElements[-1].name != name if @tree.open_elements.last.name != name
@parser.parseError("Got table cell end tag (#{name}) while required end tags are missing.") parse_error("Got table cell end tag (#{name}) while required end tags are missing.")
remove_open_elements_until(name) remove_open_elements_until(name)
else else
@tree.openElements.pop @tree.open_elements.pop
end end
@tree.clearActiveFormattingElements @tree.clearActiveFormattingElements
@parser.phase = @parser.phases[:inRow] @parser.phase = @parser.phases[:inRow]
else else
@parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) parse_error(_("Unexpected end tag (#{name}). Ignored."))
end end
end end
def endTagIgnore(name) def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) parse_error(_("Unexpected end tag (#{name}). Ignored."))
end end
def endTagImply(name) def endTagImply(name)
@ -55,8 +55,8 @@ module HTML5lib
closeCell closeCell
@parser.phase.processEndTag(name) @parser.phase.processEndTag(name)
else else
# sometimes innerHTML case # sometimes inner_html case
@parser.parseError parse_error
end end
end end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InColumnGroupPhase < Phase class InColumnGroupPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-column # http://www.whatwg.org/specs/web-apps/current-work/#in-column
@ -10,7 +10,7 @@ module HTML5lib
handle_end 'colgroup', 'col' handle_end 'colgroup', 'col'
def ignoreEndTagColgroup def ignoreEndTagColgroup
@tree.openElements[-1].name == 'html' @tree.open_elements[-1].name == 'html'
end end
def processCharacters(data) def processCharacters(data)
@ -20,8 +20,8 @@ module HTML5lib
end end
def startTagCol(name, attributes) def startTagCol(name, attributes)
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@tree.openElements.pop @tree.open_elements.pop
end end
def startTagOther(name, attributes) def startTagOther(name, attributes)
@ -32,17 +32,17 @@ module HTML5lib
def endTagColgroup(name) def endTagColgroup(name)
if ignoreEndTagColgroup if ignoreEndTagColgroup
# innerHTML case # inner_html case
assert @parser.innerHTML assert @parser.inner_html
@parser.parseError parse_error
else else
@tree.openElements.pop @tree.open_elements.pop
@parser.phase = @parser.phases[:inTable] @parser.phase = @parser.phases[:inTable]
end end
end end
def endTagCol(name) def endTagCol(name)
@parser.parseError(_('Unexpected end tag (col). col has no end tag.')) parse_error(_('Unexpected end tag (col). col has no end tag.'))
end end
def endTagOther(name) def endTagOther(name)

View file

@ -0,0 +1,57 @@
require 'html5/html5parser/phase'
module HTML5
  # Tree-construction phase used while the parser is inside a <frameset>.
  #
  # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
  #
  # NOTE(review): handle_start / handle_end are declared outside this file
  # (presumably on Phase) and appear to route each listed tag name to the
  # matching startTag<Name> / endTag<Name> method, with everything else going
  # to startTagOther / endTagOther -- confirm against Phase.
  class InFramesetPhase < Phase
    handle_start 'html', 'frameset', 'frame', 'noframes'

    handle_end 'frameset', 'noframes'

    # Character data is not accepted in this phase: report it and drop it.
    def processCharacters(data)
      message = _('Unexpected characters in the frameset phase. Characters ignored.')
      parse_error(message)
    end

    # A nested <frameset> is inserted and left open.
    def startTagFrameset(name, attributes)
      @tree.insert_element(name, attributes)
    end

    # <frame> is inserted and then immediately removed from the stack of
    # open elements, so nothing can be nested inside it.
    def startTagFrame(name, attributes)
      @tree.insert_element(name, attributes)
      @tree.open_elements.pop
    end

    # <noframes> is delegated to the :inBody phase without switching phase.
    def startTagNoframes(name, attributes)
      in_body = @parser.phases[:inBody]
      in_body.processStartTag(name, attributes)
    end

    # Any other start tag is reported and ignored.
    def startTagOther(name, attributes)
      message = _("Unexpected start tag token (#{name}) in the frameset phase. Ignored")
      parse_error(message)
    end

    # Closes the current frameset and, when appropriate, moves the parser on
    # to the :afterFrameset phase.
    def endTagFrameset(name)
      if @tree.open_elements.last.name != 'html'
        @tree.open_elements.pop
      else
        # inner_html case: there is no frameset to close, only the root.
        parse_error(_("Unexpected end tag token (frameset) in the frameset phase (inner_html)."))
      end

      # If we're not in inner_html mode and the current node is not a
      # "frameset" element (anymore) then switch.
      if !@parser.inner_html && @tree.open_elements.last.name != 'frameset'
        @parser.phase = @parser.phases[:afterFrameset]
      end
    end

    # </noframes> is delegated to the :inBody phase without switching phase.
    def endTagNoframes(name)
      in_body = @parser.phases[:inBody]
      in_body.processEndTag(name)
    end

    # Any other end tag is reported and ignored.
    def endTagOther(name)
      message = _("Unexpected end tag token (#{name}) in the frameset phase. Ignored.")
      parse_error(message)
    end
  end
end

View file

@ -0,0 +1,138 @@
require 'html5/html5parser/phase'
module HTML5
  # Tree-construction phase that is active while the parser is inside the
  # document <head>.
  class InHeadPhase < Phase

    handle_start 'html', 'head', 'title', 'style', 'script', 'noscript'
    handle_start %w[base link meta]

    handle_end 'head'
    handle_end(%w[html body br p] => 'ImplyAfterHead')
    handle_end %w[title style script noscript]

    # At end of file: report and close a still-open title/style/script
    # element, leave the head, and let the following phase handle EOF.
    def process_eof
      name = @tree.open_elements.last.name
      if %w[title style script].include?(name)
        parse_error(_("Unexpected end of file. Expected end tag (#{name})."))
        @tree.open_elements.pop
      end

      anything_else
      @parser.phase.process_eof
    end

    # Text belongs to the current node when that node is one of the
    # text-holding head elements; otherwise the head is left and the text is
    # reprocessed by the next phase.
    def processCharacters(data)
      current = @tree.open_elements.last.name
      return @tree.insertText(data) if %w[title style script noscript].include?(current)

      anything_else
      @parser.phase.processCharacters(data)
    end

    # A second <head> is reported and ignored.
    def startTagHead(name, attributes)
      parse_error(_('Unexpected start tag head in existing head. Ignored'))
    end

    # <title> always goes through appendToHead; its content is tokenized as
    # RCDATA.
    def startTagTitle(name, attributes)
      title = @tree.createElement(name, attributes)
      appendToHead(title)
      @tree.open_elements.push(title)
      @parser.tokenizer.content_model_flag = :RCDATA
    end

    # <style> content is tokenized as CDATA.
    def startTagStyle(name, attributes)
      open_cdata_element(@tree.createElement(name, attributes))
    end

    # <noscript> is currently handled like <style>.
    def startTagNoscript(name, attributes)
      # XXX Need to decide whether to implement the scripting disabled case.
      open_cdata_element(@tree.createElement(name, attributes))
    end

    # <script> is flagged as "parser-inserted" before it is attached, then
    # handled like <style>.
    def startTagScript(name, attributes)
      #XXX Inner HTML case may be wrong
      script = @tree.createElement(name, attributes)
      script._flags.push("parser-inserted")
      open_cdata_element(script)
    end

    # <base>, <link> and <meta> are attached but never pushed on the stack of
    # open elements.
    def startTagBaseLinkMeta(name, attributes)
      insert_in_head_or_current(@tree.createElement(name, attributes))
    end

    # Any other start tag ends the head and is reprocessed by the next phase.
    def startTagOther(name, attributes)
      anything_else
      @parser.phase.processStartTag(name, attributes)
    end

    # </head>: pop the head element if it is the current node, otherwise
    # report a parse error. The parser moves to "after head" in both cases.
    def endTagHead(name)
      if @tree.open_elements.last.name != 'head'
        parse_error(_("Unexpected end tag (head). Ignored."))
      else
        @tree.open_elements.pop
      end
      @parser.phase = @parser.phases[:afterHead]
    end

    # These end tags end the head and are reprocessed by the next phase.
    def endTagImplyAfterHead(name)
      anything_else
      @parser.phase.processEndTag(name)
    end

    # Closes the element only when it is the current node; otherwise the end
    # tag is reported and ignored.
    def endTagTitleStyleScriptNoscript(name)
      unless @tree.open_elements.last.name == name
        return parse_error(_("Unexpected end tag (#{name}). Ignored."))
      end

      @tree.open_elements.pop
    end

    # Any other end tag is reported and ignored.
    def endTagOther(name)
      parse_error(_("Unexpected end tag (#{name}). Ignored."))
    end

    # Leaves the head: acts as if </head> had been seen when the head element
    # is the current node, otherwise just switches to "after head".
    def anything_else
      return endTagHead('head') if @tree.open_elements.last.name == 'head'

      @parser.phase = @parser.phases[:afterHead]
    end

    protected

    # Appends +element+ to the head element, or to the current node when no
    # head element is known (asserted to be the inner_html case).
    def appendToHead(element)
      head = @tree.head_pointer
      if head.nil?
        assert @parser.inner_html
        @tree.open_elements.last.appendChild(element)
      else
        head.appendChild(element)
      end
    end

    # Appends +element+ through appendToHead when a head element is known and
    # this phase is the parser's active phase; otherwise appends it to the
    # current node.
    def insert_in_head_or_current(element)
      if !@tree.head_pointer.nil? && @parser.phase == @parser.phases[:inHead]
        appendToHead(element)
      else
        @tree.open_elements.last.appendChild(element)
      end
    end

    # Attaches +element+, makes it the current node and switches the
    # tokenizer to the CDATA content model.
    def open_cdata_element(element)
      insert_in_head_or_current(element)
      @tree.open_elements.push(element)
      @parser.tokenizer.content_model_flag = :CDATA
    end

  end
end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InRowPhase < Phase class InRowPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-row # http://www.whatwg.org/specs/web-apps/current-work/#in-row
@ -15,7 +15,7 @@ module HTML5lib
def startTagTableCell(name, attributes) def startTagTableCell(name, attributes)
clearStackToTableRowContext clearStackToTableRowContext
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inCell] @parser.phase = @parser.phases[:inCell]
@tree.activeFormattingElements.push(Marker) @tree.activeFormattingElements.push(Marker)
end end
@ -23,7 +23,7 @@ module HTML5lib
def startTagTableOther(name, attributes) def startTagTableOther(name, attributes)
ignoreEndTag = ignoreEndTagTr ignoreEndTag = ignoreEndTagTr
endTagTr('tr') endTagTr('tr')
# XXX how are we sure it's always ignored in the innerHTML case? # XXX how are we sure it's always ignored in the inner_html case?
@parser.phase.processStartTag(name, attributes) unless ignoreEndTag @parser.phase.processStartTag(name, attributes) unless ignoreEndTag
end end
@ -33,12 +33,12 @@ module HTML5lib
def endTagTr(name) def endTagTr(name)
if ignoreEndTagTr if ignoreEndTagTr
# innerHTML case # inner_html case
assert @parser.innerHTML assert @parser.inner_html
@parser.parseError parse_error
else else
clearStackToTableRowContext clearStackToTableRowContext
@tree.openElements.pop @tree.open_elements.pop
@parser.phase = @parser.phases[:inTableBody] @parser.phase = @parser.phases[:inTableBody]
end end
end end
@ -47,7 +47,7 @@ module HTML5lib
ignoreEndTag = ignoreEndTagTr ignoreEndTag = ignoreEndTagTr
endTagTr('tr') endTagTr('tr')
# Reprocess the current tag if the tr end tag was not ignored # Reprocess the current tag if the tr end tag was not ignored
# XXX how are we sure it's always ignored in the innerHTML case? # XXX how are we sure it's always ignored in the inner_html case?
@parser.phase.processEndTag(name) unless ignoreEndTag @parser.phase.processEndTag(name) unless ignoreEndTag
end end
@ -56,13 +56,13 @@ module HTML5lib
endTagTr('tr') endTagTr('tr')
@parser.phase.processEndTag(name) @parser.phase.processEndTag(name)
else else
# innerHTML case # inner_html case
@parser.parseError parse_error
end end
end end
def endTagIgnore(name) def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}) in the row phase. Ignored.")) parse_error(_("Unexpected end tag (#{name}) in the row phase. Ignored."))
end end
def endTagOther(name) def endTagOther(name)
@ -73,9 +73,9 @@ module HTML5lib
# XXX unify this with other table helper methods # XXX unify this with other table helper methods
def clearStackToTableRowContext def clearStackToTableRowContext
until ['tr', 'html'].include?(name = @tree.openElements[-1].name) until %w[tr html].include?(name = @tree.open_elements.last.name)
@parser.parseError(_("Unexpected implied end tag (#{name}) in the row phase.")) parse_error(_("Unexpected implied end tag (#{name}) in the row phase."))
@tree.openElements.pop @tree.open_elements.pop
end end
end end

View file

@ -0,0 +1,84 @@
require 'html5/html5parser/phase'
module HTML5
  # Tree-construction phase that is active while the parser is inside a
  # <select> element.
  #
  # http://www.whatwg.org/specs/web-apps/current-work/#in-select
  class InSelectPhase < Phase

    handle_start 'html', 'option', 'optgroup', 'select'

    handle_end 'option', 'optgroup', 'select', %w( caption table tbody tfoot thead tr td th ) => 'TableElements'

    # Character data is inserted as text at the current node.
    def processCharacters(data)
      @tree.insertText(data)
    end

    # Inserts an <option>, first closing an <option> that is still open.
    def startTagOption(name, attributes)
      # We need to imply </option> if <option> is the current node.
      @tree.open_elements.pop if @tree.open_elements.last.name == 'option'
      @tree.insert_element(name, attributes)
    end

    # Inserts an <optgroup>, first closing an open <option> and then an open
    # <optgroup>.
    def startTagOptgroup(name, attributes)
      @tree.open_elements.pop if @tree.open_elements.last.name == 'option'
      @tree.open_elements.pop if @tree.open_elements.last.name == 'optgroup'
      @tree.insert_element(name, attributes)
    end

    # A nested <select> start tag is a parse error and is handled as </select>.
    def startTagSelect(name, attributes)
      parse_error(_('Unexpected start tag (select) in the select phase implies select start tag.'))
      endTagSelect('select')
    end

    # Any other start tag is reported and ignored.
    def startTagOther(name, attributes)
      # Double quotes are required here so that the tag name is interpolated
      # into the message instead of a literal "#{name}" being reported.
      parse_error(_("Unexpected start tag token (#{name}) in the select phase. Ignored."))
    end

    # </option> closes the current node only if that node is an <option>.
    def endTagOption(name)
      if @tree.open_elements.last.name == 'option'
        @tree.open_elements.pop
      else
        parse_error(_('Unexpected end tag (option) in the select phase. Ignored.'))
      end
    end

    # </optgroup> closes an <option> that sits directly inside an <optgroup>,
    # then the <optgroup> itself; anything else is a parse error.
    def endTagOptgroup(name)
      # </optgroup> implicitly closes <option>
      if @tree.open_elements.last.name == 'option' and @tree.open_elements[-2].name == 'optgroup'
        @tree.open_elements.pop
      end
      # It also closes </optgroup>
      if @tree.open_elements.last.name == 'optgroup'
        @tree.open_elements.pop
      # But nothing else
      else
        parse_error(_('Unexpected end tag (optgroup) in the select phase. Ignored.'))
      end
    end

    # </select> pops elements up to and including the <select> and resets the
    # insertion mode; without a <select> in table scope it is a parse error.
    def endTagSelect(name)
      if in_scope?('select', true)
        remove_open_elements_until('select')
        @parser.reset_insertion_mode
      else
        # inner_html case
        parse_error
      end
    end

    # A table-related end tag is always a parse error. If the named element
    # is in table scope the <select> is closed and the end tag is reprocessed
    # by the phase that is active afterwards.
    def endTagTableElements(name)
      parse_error(_("Unexpected table end tag (#{name}) in the select phase."))

      if in_scope?(name, true)
        endTagSelect('select')
        @parser.phase.processEndTag(name)
      end
    end

    # Any other end tag is reported and ignored.
    def endTagOther(name)
      parse_error(_("Unexpected end tag token (#{name}) in the select phase. Ignored."))
    end

  end
end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InTableBodyPhase < Phase class InTableBodyPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0 # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
@ -15,12 +15,12 @@ module HTML5lib
def startTagTr(name, attributes) def startTagTr(name, attributes)
clearStackToTableBodyContext clearStackToTableBodyContext
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inRow] @parser.phase = @parser.phases[:inRow]
end end
def startTagTableCell(name, attributes) def startTagTableCell(name, attributes)
@parser.parseError(_("Unexpected table cell start tag (#{name}) in the table body phase.")) parse_error(_("Unexpected table cell start tag (#{name}) in the table body phase."))
startTagTr('tr', {}) startTagTr('tr', {})
@parser.phase.processStartTag(name, attributes) @parser.phase.processStartTag(name, attributes)
end end
@ -29,11 +29,11 @@ module HTML5lib
# XXX AT Any ideas on how to share this with endTagTable? # XXX AT Any ideas on how to share this with endTagTable?
if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true) if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true)
clearStackToTableBodyContext clearStackToTableBodyContext
endTagTableRowGroup(@tree.openElements[-1].name) endTagTableRowGroup(@tree.open_elements.last.name)
@parser.phase.processStartTag(name, attributes) @parser.phase.processStartTag(name, attributes)
else else
# innerHTML case # inner_html case
@parser.parseError parse_error
end end
end end
@ -44,26 +44,26 @@ module HTML5lib
def endTagTableRowGroup(name) def endTagTableRowGroup(name)
if in_scope?(name, true) if in_scope?(name, true)
clearStackToTableBodyContext clearStackToTableBodyContext
@tree.openElements.pop @tree.open_elements.pop
@parser.phase = @parser.phases[:inTable] @parser.phase = @parser.phases[:inTable]
else else
@parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored.")) parse_error(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
end end
end end
def endTagTable(name) def endTagTable(name)
if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true) if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true)
clearStackToTableBodyContext clearStackToTableBodyContext
endTagTableRowGroup(@tree.openElements[-1].name) endTagTableRowGroup(@tree.open_elements.last.name)
@parser.phase.processEndTag(name) @parser.phase.processEndTag(name)
else else
# innerHTML case # inner_html case
@parser.parseError parse_error
end end
end end
def endTagIgnore(name) def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored.")) parse_error(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
end end
def endTagOther(name) def endTagOther(name)
@ -73,9 +73,9 @@ module HTML5lib
protected protected
def clearStackToTableBodyContext def clearStackToTableBodyContext
until ['tbody', 'tfoot', 'thead', 'html'].include?(name = @tree.openElements[-1].name) until %w[tbody tfoot thead html].include?(name = @tree.open_elements.last.name)
@parser.parseError(_("Unexpected implied end tag (#{name}) in the table body phase.")) parse_error(_("Unexpected implied end tag (#{name}) in the table body phase."))
@tree.openElements.pop @tree.open_elements.pop
end end
end end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InTablePhase < Phase class InTablePhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-table # http://www.whatwg.org/specs/web-apps/current-work/#in-table
@ -12,24 +12,24 @@ module HTML5lib
handle_end 'table', %w( body caption col colgroup html tbody td tfoot th thead tr ) => 'Ignore' handle_end 'table', %w( body caption col colgroup html tbody td tfoot th thead tr ) => 'Ignore'
def processCharacters(data) def processCharacters(data)
@parser.parseError(_("Unexpected non-space characters in table context caused voodoo mode.")) parse_error(_("Unexpected non-space characters in table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in # Make all the special element rearranging voodoo kick in
@tree.insertFromTable = true @tree.insert_from_table = true
# Process the character in the "in body" mode # Process the character in the "in body" mode
@parser.phases[:inBody].processCharacters(data) @parser.phases[:inBody].processCharacters(data)
@tree.insertFromTable = false @tree.insert_from_table = false
end end
def startTagCaption(name, attributes) def startTagCaption(name, attributes)
clearStackToTableContext clearStackToTableContext
@tree.activeFormattingElements.push(Marker) @tree.activeFormattingElements.push(Marker)
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inCaption] @parser.phase = @parser.phases[:inCaption]
end end
def startTagColgroup(name, attributes) def startTagColgroup(name, attributes)
clearStackToTableContext clearStackToTableContext
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inColumnGroup] @parser.phase = @parser.phases[:inColumnGroup]
end end
@ -40,7 +40,7 @@ module HTML5lib
def startTagRowGroup(name, attributes) def startTagRowGroup(name, attributes)
clearStackToTableContext clearStackToTableContext
@tree.insertElement(name, attributes) @tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inTableBody] @parser.phase = @parser.phases[:inTableBody]
end end
@ -50,60 +50,60 @@ module HTML5lib
end end
def startTagTable(name, attributes) def startTagTable(name, attributes)
@parser.parseError(_("Unexpected start tag (table) in table phase. Implies end tag (table).")) parse_error(_("Unexpected start tag (table) in table phase. Implies end tag (table)."))
@parser.phase.processEndTag('table') @parser.phase.processEndTag('table')
@parser.phase.processStartTag(name, attributes) unless @parser.innerHTML @parser.phase.processStartTag(name, attributes) unless @parser.inner_html
end end
def startTagOther(name, attributes) def startTagOther(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) in table context caused voodoo mode.")) parse_error(_("Unexpected start tag (#{name}) in table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in # Make all the special element rearranging voodoo kick in
@tree.insertFromTable = true @tree.insert_from_table = true
# Process the start tag in the "in body" mode # Process the start tag in the "in body" mode
@parser.phases[:inBody].processStartTag(name, attributes) @parser.phases[:inBody].processStartTag(name, attributes)
@tree.insertFromTable = false @tree.insert_from_table = false
end end
def endTagTable(name) def endTagTable(name)
if in_scope?('table', true) if in_scope?('table', true)
@tree.generateImpliedEndTags @tree.generateImpliedEndTags
unless @tree.openElements[-1].name == 'table' unless @tree.open_elements.last.name == 'table'
@parser.parseError(_("Unexpected end tag (table). Expected end tag (#{@tree.openElements[-1].name}).")) parse_error(_("Unexpected end tag (table). Expected end tag (#{@tree.open_elements.last.name})."))
end end
remove_open_elements_until('table') remove_open_elements_until('table')
@parser.resetInsertionMode @parser.reset_insertion_mode
else else
# innerHTML case # inner_html case
assert @parser.innerHTML assert @parser.inner_html
@parser.parseError parse_error
end end
end end
def endTagIgnore(name) def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) parse_error(_("Unexpected end tag (#{name}). Ignored."))
end end
def endTagOther(name) def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}) in table context caused voodoo mode.")) parse_error(_("Unexpected end tag (#{name}) in table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in # Make all the special element rearranging voodoo kick in
@tree.insertFromTable = true @tree.insert_from_table = true
# Process the end tag in the "in body" mode # Process the end tag in the "in body" mode
@parser.phases[:inBody].processEndTag(name) @parser.phases[:inBody].processEndTag(name)
@tree.insertFromTable = false @tree.insert_from_table = false
end end
protected protected
def clearStackToTableContext def clearStackToTableContext
# "clear the stack back to a table context" # "clear the stack back to a table context"
until ['table', 'html'].include?(name = @tree.openElements[-1].name) until %w[table html].include?(name = @tree.open_elements.last.name)
@parser.parseError(_("Unexpected implied end tag (#{name}) in the table phase.")) parse_error(_("Unexpected implied end tag (#{name}) in the table phase."))
@tree.openElements.pop @tree.open_elements.pop
end end
# When the current node is <html> it's an innerHTML case # When the current node is <html> it's an inner_html case
end end
end end

View file

@ -1,28 +1,28 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InitialPhase < Phase class InitialPhase < Phase
# This phase deals with error handling as well which is currently not # This phase deals with error handling as well which is currently not
# covered in the specification. The error handling is typically known as # covered in the specification. The error handling is typically known as
# "quirks mode". It is expected that a future version of HTML5 will define this. # "quirks mode". It is expected that a future version of HTML5 will define this.
def processEOF def process_eof
@parser.parseError(_('Unexpected End of file. Expected DOCTYPE.')) parse_error(_('Unexpected End of file. Expected DOCTYPE.'))
@parser.phase = @parser.phases[:rootElement] @parser.phase = @parser.phases[:rootElement]
@parser.phase.processEOF @parser.phase.process_eof
end end
def processComment(data) def processComment(data)
@tree.insertComment(data, @tree.document) @tree.insert_comment(data, @tree.document)
end end
def processDoctype(name, publicId, systemId, correct) def processDoctype(name, publicId, systemId, correct)
if name.downcase != 'html' or publicId or systemId if name.downcase != 'html' or publicId or systemId
@parser.parseError(_('Erroneous DOCTYPE.')) parse_error(_('Erroneous DOCTYPE.'))
end end
# XXX need to update DOCTYPE tokens # XXX need to update DOCTYPE tokens
@tree.insertDoctype(name) @tree.insertDoctype(name, publicId, systemId)
publicId = publicId.to_s.upcase publicId = publicId.to_s.upcase
@ -110,23 +110,22 @@ module HTML5lib
end end
def processSpaceCharacters(data) def processSpaceCharacters(data)
@tree.insertText(data, @tree.document)
end end
def processCharacters(data) def processCharacters(data)
@parser.parseError(_('Unexpected non-space characters. Expected DOCTYPE.')) parse_error(_('Unexpected non-space characters. Expected DOCTYPE.'))
@parser.phase = @parser.phases[:rootElement] @parser.phase = @parser.phases[:rootElement]
@parser.phase.processCharacters(data) @parser.phase.processCharacters(data)
end end
def processStartTag(name, attributes) def processStartTag(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}). Expected DOCTYPE.")) parse_error(_("Unexpected start tag (#{name}). Expected DOCTYPE."))
@parser.phase = @parser.phases[:rootElement] @parser.phase = @parser.phases[:rootElement]
@parser.phase.processStartTag(name, attributes) @parser.phase.processStartTag(name, attributes)
end end
def processEndTag(name) def processEndTag(name)
@parser.parseError(_("Unexpected end tag (#{name}). Expected DOCTYPE.")) parse_error(_("Unexpected end tag (#{name}). Expected DOCTYPE."))
@parser.phase = @parser.phases[:rootElement] @parser.phase = @parser.phases[:rootElement]
@parser.phase.processEndTag(name) @parser.phase.processEndTag(name)
end end

View file

@ -1,4 +1,4 @@
module HTML5lib module HTML5
# Base class for helper objects that implement each phase of processing. # Base class for helper objects that implement each phase of processing.
# #
# Handler methods should be in the following order (they can be omitted): # Handler methods should be in the following order (they can be omitted):
@ -15,9 +15,12 @@ module HTML5lib
# #
class Phase class Phase
extend Forwardable
def_delegators :@parser, :parse_error
# The following example call: # The following example call:
# #
# tag_handlers('startTag', 'html', %( base link meta ), %( li dt dd ) => 'ListItem') # tag_handlers('startTag', 'html', %w( base link meta ), %w( li dt dd ) => 'ListItem')
# #
# ...would return a hash equal to this: # ...would return a hash equal to this:
# #
@ -34,15 +37,15 @@ module HTML5lib
if tags.last.is_a?(Hash) if tags.last.is_a?(Hash)
tags.pop.each do |names, handler_method_suffix| tags.pop.each do |names, handler_method_suffix|
handler_method = prefix + handler_method_suffix handler_method = prefix + handler_method_suffix
Array(names).each { |name| mapping[name] = handler_method } Array(names).each {|name| mapping[name] = handler_method }
end end
end end
tags.each do |names| tags.each do |names|
names = Array(names) names = Array(names)
handler_method = prefix + names.map { |name| name.capitalize }.join handler_method = prefix + names.map {|name| name.capitalize }.join
names.each { |name| mapping[name] = handler_method } names.each {|name| mapping[name] = handler_method }
end end
return mapping mapping
end end
def self.start_tag_handlers def self.start_tag_handlers
@ -80,17 +83,17 @@ module HTML5lib
@parser, @tree = parser, tree @parser, @tree = parser, tree
end end
def processEOF def process_eof
@tree.generateImpliedEndTags @tree.generateImpliedEndTags
if @tree.openElements.length > 2 if @tree.open_elements.length > 2
@parser.parseError(_('Unexpected end of file. Missing closing tags.')) parse_error(_('Unexpected end of file. Missing closing tags.'))
elsif @tree.openElements.length == 2 and @tree.openElements[1].name != 'body' elsif @tree.open_elements.length == 2 and @tree.open_elements[1].name != 'body'
# This happens for framesets or something? # This happens for framesets or something?
@parser.parseError(_("Unexpected end of file. Expected end tag (#{@tree.openElements[1].name}) first.")) parse_error(_("Unexpected end of file. Expected end tag (#{@tree.open_elements[1].name}) first."))
elsif @parser.innerHTML and @tree.openElements.length > 1 elsif @parser.inner_html and @tree.open_elements.length > 1
# XXX This is not what the specification says. Not sure what to do here. # XXX This is not what the specification says. Not sure what to do here.
@parser.parseError(_('XXX innerHTML EOF')) parse_error(_('XXX inner_html EOF'))
end end
# Betting ends. # Betting ends.
end end
@ -98,11 +101,11 @@ module HTML5lib
def processComment(data) def processComment(data)
# For most phases the following is correct. Where it's not it will be # For most phases the following is correct. Where it's not it will be
# overridden. # overridden.
@tree.insertComment(data, @tree.openElements[-1]) @tree.insert_comment(data, @tree.open_elements.last)
end end
def processDoctype(name, publicId, systemId, correct) def processDoctype(name, publicId, systemId, correct)
@parser.parseError(_('Unexpected DOCTYPE. Ignored.')) parse_error(_('Unexpected DOCTYPE. Ignored.'))
end end
def processSpaceCharacters(data) def processSpaceCharacters(data)
@ -114,17 +117,17 @@ module HTML5lib
end end
def startTagHtml(name, attributes) def startTagHtml(name, attributes)
if @parser.firstStartTag == false and name == 'html' if @parser.first_start_tag == false and name == 'html'
@parser.parseError(_('html needs to be the first start tag.')) parse_error(_('html needs to be the first start tag.'))
end end
# XXX Need a check here to see if the first start tag token emitted is # XXX Need a check here to see if the first start tag token emitted is
# this token... If it's not, invoke @parser.parseError. # this token... If it's not, invoke parse_error.
attributes.each do |attr, value| attributes.each do |attr, value|
unless @tree.openElements[0].attributes.has_key?(attr) unless @tree.open_elements.first.attributes.has_key?(attr)
@tree.openElements[0].attributes[attr] = value @tree.open_elements.first.attributes[attr] = value
end end
end end
@parser.firstStartTag = false @parser.first_start_tag = false
end end
def processEndTag(name) def processEndTag(name)
@ -146,11 +149,10 @@ module HTML5lib
def remove_open_elements_until(name=nil) def remove_open_elements_until(name=nil)
finished = false finished = false
until finished until finished
element = @tree.openElements.pop element = @tree.open_elements.pop
finished = name.nil?? yield(element) : element.name == name finished = name.nil? ? yield(element) : element.name == name
end end
return element return element
end end
end end
end end

View file

@ -1,40 +1,39 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class RootElementPhase < Phase class RootElementPhase < Phase
def processEOF def process_eof
insertHtmlElement insert_html_element
@parser.phase.processEOF @parser.phase.process_eof
end end
def processComment(data) def processComment(data)
@tree.insertComment(data, @tree.document) @tree.insert_comment(data, @tree.document)
end end
def processSpaceCharacters(data) def processSpaceCharacters(data)
@tree.insertText(data, @tree.document)
end end
def processCharacters(data) def processCharacters(data)
insertHtmlElement insert_html_element
@parser.phase.processCharacters(data) @parser.phase.processCharacters(data)
end end
def processStartTag(name, attributes) def processStartTag(name, attributes)
@parser.firstStartTag = true if name == 'html' @parser.first_start_tag = true if name == 'html'
insertHtmlElement insert_html_element
@parser.phase.processStartTag(name, attributes) @parser.phase.processStartTag(name, attributes)
end end
def processEndTag(name) def processEndTag(name)
insertHtmlElement insert_html_element
@parser.phase.processEndTag(name) @parser.phase.processEndTag(name)
end end
def insertHtmlElement def insert_html_element
element = @tree.createElement('html', {}) element = @tree.createElement('html', {})
@tree.openElements.push(element) @tree.open_elements.push(element)
@tree.document.appendChild(element) @tree.document.appendChild(element)
@parser.phase = @parser.phases[:beforeHead] @parser.phase = @parser.phases[:beforeHead]
end end

View file

@ -0,0 +1,35 @@
require 'html5/html5parser/phase'
module HTML5
  # Phase for tokens that arrive when end of file was expected. Comments and
  # whitespace are accepted; any other token is a parse error and is handed
  # back to the parser's previous phase (@parser.last_phase).
  class TrailingEndPhase < Phase

    # End of file is what this phase expects, so there is nothing to do.
    def process_eof
    end

    # Comments are attached to the document itself.
    def processComment(data)
      @tree.insert_comment(data, @tree.document)
    end

    # Whitespace is delegated to the previous phase without switching phases.
    def processSpaceCharacters(data)
      @parser.last_phase.processSpaceCharacters(data)
    end

    # Non-space characters: report, switch back to the previous phase and
    # reprocess the characters there.
    def processCharacters(data)
      parse_error(_('Unexpected non-space characters. Expected end of file.'))
      @parser.phase = @parser.last_phase
      @parser.phase.processCharacters(data)
    end

    # Start tag: report, switch back to the previous phase and reprocess the
    # tag there.
    def processStartTag(name, attributes)
      # Double quotes so that the tag name is interpolated into the message.
      parse_error(_("Unexpected start tag (#{name}). Expected end of file."))
      @parser.phase = @parser.last_phase
      @parser.phase.processStartTag(name, attributes)
    end

    # End tag: report, switch back to the previous phase and reprocess the
    # tag there.
    def processEndTag(name)
      # Double quotes so that the tag name is interpolated into the message.
      parse_error(_("Unexpected end tag (#{name}). Expected end of file."))
      @parser.phase = @parser.last_phase
      @parser.phase.processEndTag(name)
    end

  end
end

View file

@ -1,7 +1,7 @@
require 'stringio' require 'stringio'
require 'html5lib/constants' require 'html5/constants'
module HTML5lib module HTML5
# Provides a unicode stream of characters to the HTMLTokenizer. # Provides a unicode stream of characters to the HTMLTokenizer.
@ -10,7 +10,7 @@ module HTML5lib
class HTMLInputStream class HTMLInputStream
attr_accessor :queue, :char_encoding attr_accessor :queue, :char_encoding, :errors
# Initialises the HTMLInputStream. # Initialises the HTMLInputStream.
# #
@ -31,7 +31,7 @@ module HTML5lib
@parse_meta = true @parse_meta = true
@chardet = true @chardet = true
options.each { |name, value| instance_variable_set("@#{name}", value) } options.each {|name, value| instance_variable_set("@#{name}", value) }
# Raw Stream # Raw Stream
@raw_stream = open_stream(source) @raw_stream = open_stream(source)
@ -40,25 +40,31 @@ module HTML5lib
#Number of bytes to use when looking for a meta element with #Number of bytes to use when looking for a meta element with
#encoding information #encoding information
@NUM_BYTES_META = 512 @NUM_BYTES_META = 512
#Number of bytes to use when detecting encoding using chardet
@NUM_BYTES_CHARDET = 256
#Number of bytes to use when reading content
@NUM_BYTES_BUFFER = 1024
#Encoding to use if no other information can be found #Encoding to use if no other information can be found
@DEFAULT_ENCODING = 'windows-1252' @DEFAULT_ENCODING = 'windows-1252'
#Detect encoding iff no explicit "transport level" encoding is supplied #Detect encoding iff no explicit "transport level" encoding is supplied
if @encoding.nil? or not HTML5lib.is_valid_encoding(@encoding) if @encoding.nil? or not HTML5.is_valid_encoding(@encoding)
@char_encoding = detect_encoding @char_encoding = detect_encoding
else else
@char_encoding = @encoding @char_encoding = @encoding
end end
# Read bytes from stream decoding them into Unicode # Read bytes from stream decoding them into Unicode
uString = @raw_stream.read @buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
if @char_encoding == 'windows-1252' if @char_encoding == 'windows-1252'
@win1252 = true @win1252 = true
elsif @char_encoding != 'utf-8' elsif @char_encoding != 'utf-8'
begin begin
require 'iconv' require 'iconv'
begin begin
uString = Iconv.iconv('utf-8', @char_encoding, uString).first @buffer << @raw_stream.read unless @raw_stream.eof?
@buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
rescue rescue
@win1252 = true @win1252 = true
end end
@ -67,10 +73,8 @@ module HTML5lib
end end
end end
# Convert the unicode string into a list to be used as the data stream
@data_stream = uString
@queue = [] @queue = []
@errors = []
# Reset position in the list to read from # Reset position in the list to read from
@tell = 0 @tell = 0
@ -109,9 +113,22 @@ module HTML5lib
begin begin
require 'rubygems' require 'rubygems'
require 'UniversalDetector' # gem install chardet require 'UniversalDetector' # gem install chardet
buffer = @raw_stream.read buffers = []
encoding = UniversalDetector::chardet(buffer)['encoding'] detector = UniversalDetector::Detector.instance
seek(buffer, 0) detector.reset
until @raw_stream.eof?
buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
break if !buffer or buffer.empty?
buffers << buffer
detector.feed(buffer)
break if detector.instance_eval {@done}
detector.instance_eval {
@_mLastChar = @_mLastChar.chr if Fixnum === @_mLastChar
}
end
detector.close
encoding = detector.result['encoding']
seek(buffers*'', 0)
rescue LoadError rescue LoadError
end end
end end
@ -242,14 +259,20 @@ module HTML5lib
unless @queue.empty? unless @queue.empty?
return @queue.shift return @queue.shift
else else
c = @data_stream[@tell] if @tell + 3 > @buffer.length and !@raw_stream.eof?
# read next block
@buffer = @buffer[@tell .. -1] + @raw_stream.read(@NUM_BYTES_BUFFER)
@tell = 0
end
c = @buffer[@tell]
@tell += 1 @tell += 1
case c case c
when 0x01 .. 0x7F when 0x01 .. 0x7F
if c == 0x0D if c == 0x0D
# normalize newlines # normalize newlines
@tell += 1 if @data_stream[@tell] == 0x0A @tell += 1 if @buffer[@tell] == 0x0A
c = 0x0A c = 0x0A
end end
@ -274,9 +297,9 @@ module HTML5lib
end end
when 0xC0 .. 0xFF when 0xC0 .. 0xFF
if @win1252 if instance_variables.include?("@win1252") && @win1252
"\xC3" + (c-64).chr # convert to utf-8 "\xC3" + (c-64).chr # convert to utf-8
elsif @data_stream[@tell-1 .. -1] =~ /^ elsif @buffer[@tell-1 .. @tell+3] =~ /^
( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte ( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
@ -292,6 +315,8 @@ module HTML5lib
end end
when 0x00 when 0x00
@errors.push('null character found in input stream, ' +
'replaced with U+FFFD')
[0xFFFD].pack('U') # null characters are invalid [0xFFFD].pack('U') # null characters are invalid
else else
@ -317,6 +342,10 @@ module HTML5lib
@queue.insert(0, c) unless c == :EOF @queue.insert(0, c) unless c == :EOF
return char_stack.join('') return char_stack.join('')
end end
# Pushes already-consumed input back onto the front of @queue.
#
# characters:: a single character (String) or an Array of characters.
#              The symbol :EOF is ignored.
#
# Kernel#Array performs the coercion instead of #to_a because String#to_a
# only exists on Ruby 1.8; on later versions a String argument raised
# NoMethodError. On 1.8 Kernel#Array falls back to the same #to_a call.
def unget(characters)
  @queue.unshift(*Array(characters)) unless characters == :EOF
end
end end
# String-like object with an assosiated position and various extra methods # String-like object with an assosiated position and various extra methods
@ -433,14 +462,14 @@ module HTML5lib
if attr[0] == 'charset' if attr[0] == 'charset'
tentative_encoding = attr[1] tentative_encoding = attr[1]
if HTML5lib.is_valid_encoding(tentative_encoding) if HTML5.is_valid_encoding(tentative_encoding)
@encoding = tentative_encoding @encoding = tentative_encoding
return false return false
end end
elsif attr[0] == 'content' elsif attr[0] == 'content'
content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1])) content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
tentative_encoding = content_parser.parse tentative_encoding = content_parser.parse
if HTML5lib.is_valid_encoding(tentative_encoding) if HTML5.is_valid_encoding(tentative_encoding)
@encoding = tentative_encoding @encoding = tentative_encoding
return false return false
end end

View file

@ -11,10 +11,10 @@
# #
# @@TODO: # @@TODO:
# * Selectively lowercase only XHTML, but not foreign markup # * Selectively lowercase only XHTML, but not foreign markup
require 'html5lib/html5parser' require 'html5/html5parser'
require 'html5lib/constants' require 'html5/constants'
module HTML5lib module HTML5
# liberal XML parser # liberal XML parser
class XMLParser < HTMLParser class XMLParser < HTMLParser
@ -24,26 +24,36 @@ module HTML5lib
@phases[:initial] = XmlRootPhase.new(self, @tree) @phases[:initial] = XmlRootPhase.new(self, @tree)
end end
def normalizeToken(token) def normalize_token(token)
if token[:type] == :StartTag or token[:type] == :EmptyTag case token[:type]
when :StartTag, :EmptyTag
# We need to remove the duplicate attributes and convert attributes # We need to remove the duplicate attributes and convert attributes
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"} # to a Hash so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
token[:data] = Hash[*token[:data].reverse.flatten] token[:data] = Hash[*token[:data].reverse.flatten]
# For EmptyTags, process both a Start and an End tag # For EmptyTags, process both a Start and an End tag
if token[:type] == :EmptyTag if token[:type] == :EmptyTag
save = @tokenizer.content_model_flag
@phase.processStartTag(token[:name], token[:data]) @phase.processStartTag(token[:name], token[:data])
@tokenizer.content_model_flag = save
token[:data] = {} token[:data] = {}
token[:type] = :EndTag token[:type] = :EndTag
end end
elsif token[:type] == :EndTag when :Characters
if token[:data] # un-escape RCDATA_ELEMENTS (e.g. style, script)
parseError(_("End tag contains unexpected attributes.")) if @tokenizer.content_model_flag == :CDATA
token[:data] = token[:data].
gsub('&lt;','<').gsub('&gt;','>').gsub('&amp;','&')
end end
elsif token[:type] == :Comment when :EndTag
if token[:data]
parse_error(_("End tag contains unexpected attributes."))
end
when :Comment
# Rescue CDATA from the comments # Rescue CDATA from the comments
if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]" if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
token[:type] = :Characters token[:type] = :Characters
@ -64,22 +74,22 @@ module HTML5lib
@phases[:rootElement] = XhmlRootPhase.new(self, @tree) @phases[:rootElement] = XhmlRootPhase.new(self, @tree)
end end
def normalizeToken(token) def normalize_token(token)
super(token) super(token)
# ensure that non-void XHTML elements have content so that separate # ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted # open and close tags are emitted
if token[:type] == :EndTag if token[:type] == :EndTag
if VOID_ELEMENTS.include? token[:name] if VOID_ELEMENTS.include? token[:name]
if @tree.openElements[-1].name != token["name"]: if @tree.open_elements[-1].name != token["name"]:
token[:type] = :EmptyTag token[:type] = :EmptyTag
token["data"] ||= {} token["data"] ||= {}
end end
else else
if token[:name] == @tree.openElements[-1].name and \ if token[:name] == @tree.open_elements[-1].name and \
not @tree.openElements[-1].hasContent not @tree.open_elements[-1].hasContent
@tree.insertText('') unless @tree.insertText('') unless
@tree.openElements.any? {|e| @tree.open_elements.any? {|e|
e.attributes.keys.include? 'xmlns' and e.attributes.keys.include? 'xmlns' and
e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml' e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
} }
@ -92,9 +102,9 @@ module HTML5lib
end end
class XhmlRootPhase < RootElementPhase class XhmlRootPhase < RootElementPhase
def insertHtmlElement def insert_html_element
element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'}) element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
@tree.openElements.push(element) @tree.open_elements.push(element)
@tree.document.appendChild(element) @tree.document.appendChild(element)
@parser.phase = @parser.phases[:beforeHead] @parser.phase = @parser.phases[:beforeHead]
end end
@ -105,15 +115,15 @@ module HTML5lib
@start_tag_handlers = Hash.new(:startTagOther) @start_tag_handlers = Hash.new(:startTagOther)
@end_tag_handlers = Hash.new(:endTagOther) @end_tag_handlers = Hash.new(:endTagOther)
def startTagOther(name, attributes) def startTagOther(name, attributes)
@tree.openElements.push(@tree.document) @tree.open_elements.push(@tree.document)
element = @tree.createElement(name, attributes) element = @tree.createElement(name, attributes)
@tree.openElements[-1].appendChild(element) @tree.open_elements[-1].appendChild(element)
@tree.openElements.push(element) @tree.open_elements.push(element)
@parser.phase = XmlElementPhase.new(@parser,@tree) @parser.phase = XmlElementPhase.new(@parser,@tree)
end end
def endTagOther(name) def endTagOther(name)
super super
@tree.openElements.pop @tree.open_elements.pop
end end
end end
@ -125,17 +135,17 @@ module HTML5lib
def startTagOther(name, attributes) def startTagOther(name, attributes)
element = @tree.createElement(name, attributes) element = @tree.createElement(name, attributes)
@tree.openElements[-1].appendChild(element) @tree.open_elements[-1].appendChild(element)
@tree.openElements.push(element) @tree.open_elements.push(element)
end end
def endTagOther(name) def endTagOther(name)
for node in @tree.openElements.reverse for node in @tree.open_elements.reverse
if node.name == name if node.name == name
{} while @tree.openElements.pop != node {} while @tree.open_elements.pop != node
break break
else else
@parser.parseError parse_error
end end
end end
end end

View file

@ -1,6 +1,7 @@
require 'cgi' require 'cgi'
require 'html5/tokenizer'
module HTML5lib module HTML5
# This module provides sanitization of XHTML+MathML+SVG # This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes. # and of inline style attributes.
@ -12,7 +13,7 @@ module HTML5lib
# or, if you already have a parse tree (in this example, a REXML tree), # or, if you already have a parse tree (in this example, a REXML tree),
# at the Serializer stage: # at the Serializer stage:
# #
# tokens = TreeWalkers.getTreeWalker('rexml').new(tree) # tokens = TreeWalkers.get_tree_walker('rexml').new(tree)
# HTMLSerializer.serialize(tokens, {:encoding=>'utf-8', # HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
# :sanitize => true}) # :sanitize => true})

View file

@ -0,0 +1,2 @@
require 'html5/serializer/htmlserializer'
require 'html5/serializer/xhtmlserializer'

View file

@ -1,6 +1,6 @@
require 'html5lib/constants' require 'html5/constants'
module HTML5lib module HTML5
class HTMLSerializer class HTMLSerializer
@ -21,6 +21,7 @@ module HTML5lib
@use_trailing_solidus = false @use_trailing_solidus = false
@space_before_trailing_solidus = true @space_before_trailing_solidus = true
@escape_lt_in_attrs = false @escape_lt_in_attrs = false
@escape_rcdata = false
@omit_optional_tags = true @omit_optional_tags = true
@sanitize = false @sanitize = false
@ -43,22 +44,22 @@ module HTML5lib
@errors = [] @errors = []
if encoding and @inject_meta_charset if encoding and @inject_meta_charset
require 'html5lib/filters/inject_meta_charset' require 'html5/filters/inject_meta_charset'
treewalker = Filters::InjectMetaCharset.new(treewalker, encoding) treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
end end
if @strip_whitespace if @strip_whitespace
require 'html5lib/filters/whitespace' require 'html5/filters/whitespace'
treewalker = Filters::WhitespaceFilter.new(treewalker) treewalker = Filters::WhitespaceFilter.new(treewalker)
end end
if @sanitize if @sanitize
require 'html5lib/filters/sanitizer' require 'html5/filters/sanitizer'
treewalker = Filters::HTMLSanitizeFilter.new(treewalker) treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
end end
if @omit_optional_tags if @omit_optional_tags
require 'html5lib/filters/optionaltags' require 'html5/filters/optionaltags'
treewalker = Filters::OptionalTagFilter.new(treewalker) treewalker = Filters::OptionalTagFilter.new(treewalker)
end end
@ -72,7 +73,7 @@ module HTML5lib
elsif [:Characters, :SpaceCharacters].include? type elsif [:Characters, :SpaceCharacters].include? type
if type == :SpaceCharacters or in_cdata if type == :SpaceCharacters or in_cdata
if in_cdata and token[:data].include?("</") if in_cdata and token[:data].include?("</")
serializeError(_("Unexpected </ in CDATA")) serialize_error(_("Unexpected </ in CDATA"))
end end
result << token[:data] result << token[:data]
else else
@ -81,10 +82,10 @@ module HTML5lib
elsif [:StartTag, :EmptyTag].include? type elsif [:StartTag, :EmptyTag].include? type
name = token[:name] name = token[:name]
if RCDATA_ELEMENTS.include?(name) if RCDATA_ELEMENTS.include?(name) and not @escape_rcdata
in_cdata = true in_cdata = true
elsif in_cdata elsif in_cdata
serializeError(_("Unexpected child element of a CDATA element")) serialize_error(_("Unexpected child element of a CDATA element"))
end end
attributes = [] attributes = []
for k,v in attrs = token[:data].to_a.sort for k,v in attrs = token[:data].to_a.sort
@ -136,19 +137,19 @@ module HTML5lib
if RCDATA_ELEMENTS.include?(name) if RCDATA_ELEMENTS.include?(name)
in_cdata = false in_cdata = false
elsif in_cdata elsif in_cdata
serializeError(_("Unexpected child element of a CDATA element")) serialize_error(_("Unexpected child element of a CDATA element"))
end end
end_tag = "</#{name}>" end_tag = "</#{name}>"
result << end_tag result << end_tag
elsif type == :Comment elsif type == :Comment
data = token[:data] data = token[:data]
serializeError(_("Comment contains --")) if data.index("--") serialize_error(_("Comment contains --")) if data.index("--")
comment = "<!--%s-->" % token[:data] comment = "<!--%s-->" % token[:data]
result << comment result << comment
else else
serializeError(token[:data]) serialize_error(token[:data])
end end
end end
@ -162,13 +163,15 @@ module HTML5lib
alias :render :serialize alias :render :serialize
def serializeError(data="XXX ERROR MESSAGE NEEDED") def serialize_error(data="XXX ERROR MESSAGE NEEDED")
# XXX The idea is to make data mandatory. # XXX The idea is to make data mandatory.
@errors.push(data) @errors.push(data)
if @strict if @strict
raise SerializeError raise SerializeError
end end
end end
# Message hook wrapped around the serializer's error texts; it currently
# hands the given string back untouched.
def _(string)
  string
end
end end
# Error in serialized tree # Error in serialized tree

View file

@ -0,0 +1,20 @@
require 'html5/serializer/htmlserializer'
module HTML5
  # HTMLSerializer variant that only differs in its default option values.
  class XHTMLSerializer < HTMLSerializer
    # Option values used unless the caller supplies a different one.
    DEFAULTS = {
      :quote_attr_values => true,
      :minimize_boolean_attributes => false,
      :use_trailing_solidus => true,
      :escape_lt_in_attrs => true,
      :omit_optional_tags => false,
      :escape_rcdata => true
    }

    # options:: Hash of serializer options; its entries override DEFAULTS.
    #           The combined Hash is handed to HTMLSerializer#initialize.
    def initialize(options={})
      # Hash#merge builds a new Hash, so DEFAULTS itself is never modified.
      super(DEFAULTS.merge(options))
    end
  end
end

View file

@ -0,0 +1,45 @@
module HTML5
  module Sniffer
    # Decides whether a document is HTML or a feed by looking at its start
    # (the original code labels this algorithm "4.7.4").
    #
    # Only the first 512 characters of +str+ are inspected. Leading
    # whitespace, "<!-- ... -->" comments, other "<! ... >" declarations and
    # "<? ... ?>" instructions are skipped; the first other construct decides.
    #
    # Returns "application/rss+xml" for "<rss", "application/atom+xml" for
    # "<feed" and "text/html" for everything else.
    # Raises NotImplementedError when the document starts with "<rdf:RDF".
    #
    # Characters are compared as one-character strings (s[pos, 1]) instead of
    # integer byte values: String#[] with a single index only returns an
    # Integer on Ruby 1.8. On later versions it returns a String, so integer
    # comparisons never matched and every input was reported as "text/html".
    def html_or_feed(str)
      s = str[0, 512] # steps 1, 2
      pos = 0
      while pos < s.length
        case s[pos, 1]
        when "\t", " ", "\n", "\r"
          pos += 1
        when "<"
          pos += 1
          if s[pos, 3] == "!--"
            # comment: skip to just after the closing "-->"
            pos += 3
            pos += 1 until s[pos, 3] == "-->" or pos >= s.length
            pos += 3
          elsif s[pos, 1] == "!"
            # other declaration (e.g. a doctype): skip to just after ">"
            pos += 1
            pos += 1 until s[pos, 1] == ">" or pos >= s.length
            pos += 1
          elsif s[pos, 1] == "?"
            # processing instruction: skip to just after "?>"
            pos += 1 until s[pos, 2] == "?>" or pos >= s.length
            pos += 2
          elsif s[pos, 3] == "rss"
            return "application/rss+xml"
          elsif s[pos, 4] == "feed"
            return "application/atom+xml"
          elsif s[pos, 7] == "rdf:RDF"
            raise NotImplementedError
          end
        else
          break
        end
      end
      "text/html"
    end
  end
end

View file

@ -0,0 +1,968 @@
require 'html5/constants'
require 'html5/inputstream'
module HTML5
# This class takes care of tokenizing HTML.
#
# * @current_token
# Holds the token that is currently being processed.
#
# * @state
# Holds a reference to the method to be invoked... XXX
#
# * @states
# Holds a mapping between states and methods that implement the state.
#
# * @stream
# Points to HTMLInputStream object.
class HTMLTokenizer
attr_accessor :content_model_flag, :current_token
attr_reader :stream
# XXX need to fix documentation
# Builds a tokenizer for +stream+.
#
# stream::  handed, together with +options+, to HTMLInputStream.new
# options:: Hash; :lowercase_element_name and :lowercase_attr_name are read
#           here and count as enabled unless they are exactly +false+
def initialize(stream, options = {})
  @stream = HTMLInputStream.new(stream, options)

  # Name lowercasing is on by default; only an explicit false disables it.
  @lowercase_element_name = !(options[:lowercase_element_name] == false)
  @lowercase_attr_name = !(options[:lowercase_attr_name] == false)

  # Initial state of the state machine.
  @state = :data_state
  @content_model_flag = :PCDATA
  @escapeFlag = false
  @lastFourChars = []

  # Token under construction and tokens waiting to be yielded.
  @current_token = nil
  @token_queue = []
end
# This is where the magic happens.
#
# We do our usual processing through the states and when we have a token
# to return we yield the token which pauses processing until the next token
# is requested.
# Drives the state machine and yields every token it produces.
#
# Each pass calls the method named by @state. After a pass, everything
# collected in @stream.errors is yielded first (wrapped as :ParseError
# tokens), then the contents of @token_queue. The loop ends as soon as a
# state method returns a false value.
def each
  @token_queue = []
  while send(@state)
    until @stream.errors.empty?
      yield({:type => :ParseError, :data => @stream.errors.shift})
    end
    until @token_queue.empty?
      yield(@token_queue.shift)
    end
  end
end
# Below are various helper functions the tokenizer states use worked out.
# If the next character is a '>', convert the current_token into
# an EmptyTag
# Handles a "/" met inside a tag: a start tag directly followed by ">"
# becomes an :EmptyTag, anything else queues a parse error.
def process_solidus_in_tag
  # Peek at the character after the solidus.
  lookahead = @stream.char
  self_closing = (@current_token[:type] == :StartTag and lookahead == ">")
  if self_closing
    @current_token[:type] = :EmptyTag
  else
    @token_queue << {:type => :ParseError, :data => _("Solidus (/) incorrectly placed in tag.")}
  end
  # The peeked character still belongs to the input, so give it back.
  @stream.unget(lookahead)
end
# This function returns either U+FFFD or the character based on the
# decimal or hexadecimal representation. It also discards ";" if present.
# If it is not present, a ParseError token is appended to @token_queue.
# Reads the digits of a numeric character reference from @stream and
# returns the referenced character as a UTF-8 string.
#
# isHex:: true to read hexadecimal digits, otherwise decimal digits.
#
# Code 13 is replaced by 10, codes 128..159 are mapped through
# ENTITIES_WINDOWS1252, and codes outside 1..1114111 or inside the range
# 55296..57343 yield U+FFFD. Each of these cases, as well as a missing
# trailing ";", queues a :ParseError token.
def consume_number_entity(isHex)
  allowed, radix = isHex ? [HEX_DIGITS, 16] : [DIGITS, 10]

  # Collect digits until something else (or the end of input) shows up.
  digits = []
  c = @stream.char
  while allowed.include?(c) and c != :EOF
    digits.push(c)
    c = @stream.char
  end

  code = digits.join('').to_i(radix)
  if code == 13
    @token_queue << {:type => :ParseError, :data => _("Incorrect CR newline entity. Replaced with LF.")}
    code = 10
  elsif (128..159).include? code
    # The "windows trick": remap the 128..159 range.
    @token_queue << {:type => :ParseError, :data => _("Entity used with illegal number (windows-1252 reference).")}
    code = ENTITIES_WINDOWS1252[code - 128]
  end

  in_range = (0 < code and code <= 1114111)
  if in_range and not (55296 <= code and code <= 57343)
    char = [code].pack('U')
  else
    char = [0xFFFD].pack('U')
    @token_queue << {:type => :ParseError, :data => _("Numeric entity represents an illegal codepoint.")}
  end

  # A terminating ";" is swallowed; any other character is handed back.
  unless c == ";"
    @token_queue << {:type => :ParseError, :data => _("Numeric entity didn't end with ';'.")}
    @stream.unget(c)
  end
  char
end
# Tries to read a character reference from @stream; the "&" itself has
# already been consumed by the caller.
#
# from_attribute:: true when called while reading an attribute value.
#
# Returns the replacement text, or nil when the input is not an entity (all
# characters read are then handed back to the stream). Inside an attribute,
# a named entity that is NOT terminated by ";" and is directly followed by
# a letter or digit is left alone: everything is handed back and "&" is
# returned. Problems are reported as :ParseError tokens on @token_queue.
def consume_entity(from_attribute=false)
  char = nil
  char_stack = [@stream.char]
  if SPACE_CHARACTERS.include?(char_stack[0]) or [:EOF, '<', '&'].include?(char_stack[0])
    @stream.unget(char_stack)
  elsif char_stack[0] == '#'
    # We might have a number entity here.
    char_stack += [@stream.char, @stream.char]
    if char_stack[0 .. 1].include? :EOF
      # If we reach the end of the file put everything up to :EOF
      # back in the queue
      char_stack = char_stack[0...char_stack.index(:EOF)]
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => _("Numeric entity expected. Got end of file instead.")}
    else
      if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
        # Hexadecimal entity detected.
        @stream.unget(char_stack[2])
        char = consume_number_entity(true)
      elsif DIGITS.include? char_stack[1]
        # Decimal entity detected.
        @stream.unget(char_stack[1..-1])
        char = consume_number_entity(false)
      else
        # No number entity detected.
        @stream.unget(char_stack)
        @token_queue << {:type => :ParseError, :data => _("Numeric entity expected but none found.")}
      end
    end
  else
    # At this point we might have a named entity. Consume characters for as
    # long as they are still a prefix of some key of ENTITIES.
    #
    # [0, 1] / [-1, 1] are used for single-character access because they
    # return a String on every Ruby version (a bare index returns an
    # Integer on Ruby 1.8).
    filteredEntityList = ENTITIES.keys
    filteredEntityList.reject! {|e| e[0, 1] != char_stack[0]}
    entityName = nil

    # Try to find the longest entity the string will match to take care
    # of &noti for instance. char_stack always holds one character more
    # than +name+: the look-ahead character.
    while char_stack.last != :EOF
      name = char_stack.join('')
      if filteredEntityList.any? {|e| e[0...name.length] == name}
        filteredEntityList.reject! {|e| e[0...name.length] != name}
        char_stack.push(@stream.char)
      else
        break
      end

      if ENTITIES.include? name
        entityName = name
        break if entityName[-1, 1] == ';'
      end
    end

    if entityName != nil
      char = ENTITIES[entityName]
      terminated = (entityName[-1, 1] == ';')

      unless terminated
        @token_queue << {:type => :ParseError, :data => _("Named entity didn't end with ';'.")}
      end

      # The "leave it alone inside attributes" rule only applies to names
      # without a trailing ";". The previous code inspected the look-ahead
      # character (char_stack[-1]) instead of the matched name, so a
      # properly terminated entity followed by a letter or digit (e.g.
      # "&lt;b") was not decoded in attribute values.
      following = char_stack[entityName.length]
      if !terminated and from_attribute and
         (ASCII_LETTERS.include?(following) or DIGITS.include?(following))
        @stream.unget(char_stack)
        char = '&'
      else
        # Hand back whatever was read beyond the entity name.
        @stream.unget(char_stack[entityName.length..-1])
      end
    else
      @token_queue << {:type => :ParseError, :data => _("Named entity expected. Got none.")}
      @stream.unget(char_stack)
    end
  end
  return char
end
# This method replaces the need for "entityInAttributeValueState".
# Appends the result of consume_entity(true) to the value of the attribute
# currently being built; a literal "&" is appended when no entity was found.
def process_entity_in_attribute
  replacement = consume_entity(true) || "&"
  @current_token[:data][-1][1] += replacement
end
# This method is a generic handler for emitting the tags. It also sets
# the state to "data" because that's what's needed after a token has been
# emitted.
# Queues @current_token and switches back to the data state. Tokens whose
# type is not :StartTag, :EndTag or :EmptyTag are ignored here.
def emit_current_token
  token = @current_token
  return unless [:StartTag, :EndTag, :EmptyTag].include?(token[:type])

  token[:name] = token[:name].downcase if @lowercase_element_name
  @token_queue << token
  @state = :data_state
end
# Below are the various tokenizer states worked out.
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
# documents to figure out what the order of the various if and elsif
# statements should be.
# Tokenizer state: plain character data.
#
# Reads one character from @stream and either switches @state or queues a
# :Characters / :SpaceCharacters token. Returns false at :EOF (which stops
# the main loop) and true otherwise.
#
# While the content model is :CDATA or :RCDATA, @lastFourChars tracks the
# most recently consumed characters so "<!--" and "-->" can toggle
# @escapeFlag. Characters swallowed in bulk by chars_until are recorded as
# well; previously only the first character of each run was remembered,
# so input such as "<!x--" was mistaken for "<!--".
def data_state
  data = @stream.char

  tracking = (@content_model_flag == :CDATA or @content_model_flag == :RCDATA)
  if tracking
    @lastFourChars << data
    @lastFourChars.shift if @lastFourChars.length > 4
  end

  if data == "&" and [:PCDATA,:RCDATA].include?(@content_model_flag) and !@escapeFlag
    @state = :entity_data_state
  elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
    @escapeFlag = true
    @token_queue << {:type => :Characters, :data => data}
  elsif data == "<" and !@escapeFlag and
        [:PCDATA,:CDATA,:RCDATA].include?(@content_model_flag)
    @state = :tag_open_state
  elsif data == ">" and @escapeFlag and
        [:CDATA,:RCDATA].include?(@content_model_flag) and
        @lastFourChars[1..-1].join('') == "-->"
    @escapeFlag = false
    @token_queue << {:type => :Characters, :data => data}
  elsif data == :EOF
    # Tokenization ends.
    return false
  elsif SPACE_CHARACTERS.include? data
    # Directly after emitting a token you switch back to the "data
    # state". At that point SPACE_CHARACTERS are important so they are
    # emitted separately.
    run = @stream.chars_until(SPACE_CHARACTERS, true)
    if tracking
      @lastFourChars.concat(run.split(''))
      @lastFourChars.shift while @lastFourChars.length > 4
    end
    @token_queue << {:type => :SpaceCharacters, :data => data + run}
  else
    run = @stream.chars_until(%w[& < > -])
    if tracking
      @lastFourChars.concat(run.split(''))
      @lastFourChars.shift while @lastFourChars.length > 4
    end
    @token_queue << {:type => :Characters, :data => data + run}
  end
  return true
end
# Tokenizer state entered after "&" in character data: queues the result of
# consume_entity (or a literal "&" when it returns nothing) as a
# :Characters token and goes back to the data state. Always returns true.
def entity_data_state
  @token_queue << {:type => :Characters, :data => (consume_entity || "&")}
  @state = :data_state
  true
end
# Tokenizer state entered after "<". Always returns true.
def tag_open_state
  data = @stream.char

  unless @content_model_flag == :PCDATA
    # Per the original note, the flag is RCDATA or CDATA here; only "</"
    # is treated as markup, anything else is literal text.
    if data == "/"
      @state = :close_tag_open_state
    else
      @token_queue << {:type => :Characters, :data => "<"}
      @stream.unget(data)
      @state = :data_state
    end
    return true
  end

  if data == "!"
    @state = :markup_declaration_open_state
  elsif data == "/"
    @state = :close_tag_open_state
  elsif data != :EOF and ASCII_LETTERS.include? data
    # First letter of a start tag name.
    @current_token = {:type => :StartTag, :name => data, :data => []}
    @state = :tag_name_state
  elsif data == ">"
    # "<>" is passed through as text.
    @token_queue << {:type => :ParseError, :data => _("Expected tag name. Got '>' instead.")}
    @token_queue << {:type => :Characters, :data => "<>"}
    @state = :data_state
  elsif data == "?"
    # The "?" is handed back so it becomes part of the bogus comment.
    @token_queue.push({:type => :ParseError, :data => _("Expected tag name. Got '?' instead (HTML doesn't " +
                                                         "support processing instructions).")})
    @stream.unget(data)
    @state = :bogus_comment_state
  else
    # Anything else: the "<" was just text.
    @token_queue << {:type => :ParseError, :data => _("Expected tag name. Got something else instead")}
    @token_queue << {:type => :Characters, :data => "<"}
    @stream.unget(data)
    @state = :data_state
  end
  true
end
# Tokenizer state entered after "</". Always returns true.
#
# In RCDATA/CDATA content the "</" only counts as markup when it is followed
# by the name of @current_token (compared case-insensitively) and then by
# whitespace, ">", "/", "<" or the end of input; otherwise it is queued as
# plain text.
def close_tag_open_state
  if [:RCDATA, :CDATA].include?(@content_model_flag)
    closes_current = false
    if @current_token
      # Peek at name-length + 1 characters, then hand them straight back.
      lookahead = []
      (@current_token[:name].length + 1).times do
        lookahead.push(@stream.char)
        # Make sure we don't get hit by :EOF
        break if lookahead[-1] == :EOF
      end
      @stream.unget(lookahead)

      closes_current =
        (@current_token[:name].downcase == lookahead[0...-1].join('').downcase and
         (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include?(lookahead[-1]))
    end

    unless closes_current
      @token_queue << {:type => :Characters, :data => "</"}
      @state = :data_state
      # Nothing below applies to literal text.
      return true
    end

    # A matching end tag follows, so regular parsing resumes right here.
    @content_model_flag = :PCDATA
  end

  data = @stream.char
  if data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Unexpected end of file.")}
    @token_queue << {:type => :Characters, :data => "</"}
    @state = :data_state
  elsif ASCII_LETTERS.include? data
    @current_token = {:type => :EndTag, :name => data, :data => []}
    @state = :tag_name_state
  elsif data == ">"
    @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Got '>' instead. Ignoring '</>'.")}
    @state = :data_state
  else
    # XXX data can be _'_...
    @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Unexpected character '#{data}' found.")}
    @stream.unget(data)
    @state = :bogus_comment_state
  end
  true
end
# Tokenizer state: reading the name of @current_token. Always returns true.
def tag_name_state
  data = @stream.char
  case
  when SPACE_CHARACTERS.include?(data)
    @state = :before_attribute_name_state
  when data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in the tag name.")}
    emit_current_token
  when ASCII_LETTERS.include?(data)
    # Take the whole run of letters at once.
    @current_token[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
  when data == ">"
    emit_current_token
  when data == "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    # Any other character simply becomes part of the name.
    @current_token[:name] += data
  end
  true
end
# Tokenizer state: inside a tag, before an attribute name starts.
# Always returns true.
def before_attribute_name_state
  data = @stream.char
  case
  when SPACE_CHARACTERS.include?(data)
    # Skip the rest of the whitespace run.
    @stream.chars_until(SPACE_CHARACTERS, true)
  when data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected attribute name instead.")}
    emit_current_token
  when ASCII_LETTERS.include?(data)
    # A letter opens a new [name, value] pair.
    @current_token[:data].push([data, ""])
    @state = :attribute_name_state
  when data == ">"
    emit_current_token
  when data == "/"
    process_solidus_in_tag
  else
    # Any other character also starts an attribute name.
    @current_token[:data].push([data, ""])
    @state = :attribute_name_state
  end
  true
end
# Tokenizer state: reading an attribute name. Always returns true.
#
# When the name is complete it is lowercased (if @lowercase_attr_name is
# set) and compared with the earlier attributes of the same tag; a repeated
# name queues a single :ParseError. The duplicate itself is kept here.
def attribute_name_state
  data = @stream.char
  name_complete = true
  emit_after_check = false

  case
  when data == "="
    @state = :before_attribute_value_state
  when data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute name.")}
    @state = :data_state
    emit_after_check = true
  when ASCII_LETTERS.include?(data)
    @current_token[:data][-1][0] += data + @stream.chars_until(ASCII_LETTERS, true)
    name_complete = false
  when data == ">"
    # Emission is postponed until after the duplicate check below.
    emit_after_check = true
  when SPACE_CHARACTERS.include?(data)
    @state = :after_attribute_name_state
  when data == "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    @current_token[:data][-1][0] += data
    name_complete = false
  end

  return true unless name_complete

  attributes = @current_token[:data]
  attributes[-1][0] = attributes.last.first.downcase if @lowercase_attr_name

  # Report a repeated name once, however many earlier attributes share it.
  finished_name = attributes.last.first
  if attributes[0...-1].any? {|name, value| finished_name == name }
    @token_queue << {:type => :ParseError, :data =>_("Dropped duplicate attribute on tag.")}
  end

  emit_current_token if emit_after_check
  true
end
# Tokenizer state: whitespace has followed an attribute name.
# Always returns true.
def after_attribute_name_state
  data = @stream.char
  case
  when SPACE_CHARACTERS.include?(data)
    # Skip the rest of the whitespace run.
    @stream.chars_until(SPACE_CHARACTERS, true)
  when data == "="
    @state = :before_attribute_value_state
  when data == ">"
    emit_current_token
  when data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected = or end of tag.")}
    emit_current_token
  when ASCII_LETTERS.include?(data)
    # The previous attribute had no value; a new one begins.
    @current_token[:data].push([data, ""])
    @state = :attribute_name_state
  when data == "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    @current_token[:data].push([data, ""])
    @state = :attribute_name_state
  end
  true
end
# Tokenizer state: after "=", before the attribute value starts.
# Always returns true.
def before_attribute_value_state
  data = @stream.char
  case
  when SPACE_CHARACTERS.include?(data)
    @stream.chars_until(SPACE_CHARACTERS, true)
  when data == "\""
    @state = :attribute_value_double_quoted_state
  when data == "&"
    # Unquoted value starting with an entity: let that state re-read the "&".
    @state = :attribute_value_unquoted_state
    @stream.unget(data)
  when data == "'"
    @state = :attribute_value_single_quoted_state
  when data == ">"
    emit_current_token
  when data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected attribute value.")}
    emit_current_token
  else
    # First character of an unquoted value.
    @current_token[:data][-1][1] += data
    @state = :attribute_value_unquoted_state
  end
  true
end
# Tokenizer state: inside a double-quoted attribute value.
# Always returns true.
def attribute_value_double_quoted_state
  data = @stream.char
  case
  when data == "\""
    # Closing quote ends the value.
    @state = :before_attribute_name_state
  when data == "&"
    process_entity_in_attribute
  when data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value (\").")}
    emit_current_token
  else
    # Append everything up to the next quote or entity in one step.
    @current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
  end
  true
end
# Tokenizer state: inside a single-quoted attribute value.
# Always returns true.
def attribute_value_single_quoted_state
  data = @stream.char
  case
  when data == "'"
    # Closing quote ends the value.
    @state = :before_attribute_name_state
  when data == "&"
    process_entity_in_attribute
  when data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value (').")}
    emit_current_token
  else
    # Append everything up to the next quote or entity in one step.
    rest_of_run = @stream.chars_until(["'", "&"])
    @current_token[:data][-1][1] += data + rest_of_run
  end
  true
end
# Tokenizer state: inside an unquoted attribute value.
# Always returns true.
def attribute_value_unquoted_state
  data = @stream.char
  case
  when SPACE_CHARACTERS.include?(data)
    # Whitespace ends an unquoted value.
    @state = :before_attribute_name_state
  when data == "&"
    process_entity_in_attribute
  when data == ">"
    emit_current_token
  when data == :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value.")}
    emit_current_token
  else
    # Append everything up to the next character that needs a decision.
    stoppers = ["&", ">","<"] + SPACE_CHARACTERS
    @current_token[:data][-1][1] += data + @stream.chars_until(stoppers)
  end
  true
end
# State handler for a bogus comment: everything up to (but excluding) the
# next ">" becomes the data of a new :Comment token, which is queued
# immediately. Always returns true.
def bogus_comment_state
  # chars_until is expected to stop at :EOF by itself, so no explicit
  # end-of-file check is made here.
  text = @stream.chars_until((">"))
  @token_queue << {:type => :Comment, :data => text}

  # Discard the terminator that follows the comment text (a ">" or :EOF).
  @stream.char
  @state = :data_state
  true
end
# State handler run after "<!": recognises the start of a comment ("--") or
# a DOCTYPE (case-insensitive); anything else is reported as a parse error,
# pushed back onto the stream and handled as a bogus comment.
# Always returns true.
def markup_declaration_open_state
  consumed = [@stream.char, @stream.char]

  if consumed == ["-", "-"]
    @current_token = {:type => :Comment, :data => ""}
    @state = :comment_start_state
    return true
  end

  # Read five more characters so that seven are available to compare
  # against "DOCTYPE".
  5.times { consumed.push(@stream.char) }

  # :EOF is checked explicitly before the characters are joined.
  is_doctype = !consumed.include?(:EOF) && consumed.join("").upcase == "DOCTYPE"
  if is_doctype
    @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
    @state = :doctype_state
  else
    @token_queue << {:type => :ParseError, :data => _("Expected '--' or 'DOCTYPE'. Not found.")}
    @stream.unget(consumed)
    @state = :bogus_comment_state
  end
  true
end
# State handler for the first character after "<!--".
# Always returns true.
def comment_start_state
  char = @stream.char
  case char
  when "-"
    @state = :comment_start_dash_state
  when ">"
    # "<!-->": queue the (empty) comment along with a parse error.
    @token_queue << {:type => :ParseError, :data => _("Incorrect comment.")}
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
    @token_queue << @current_token
    @state = :data_state
  else
    # Ordinary comment text: take everything up to the next "-".
    @current_token[:data] += char + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end
# State handler for the character after "<!---" (comment start followed by
# one dash). Always returns true.
def comment_start_dash_state
  char = @stream.char
  case char
  when "-"
    @state = :comment_end_state
  when ">"
    @token_queue << {:type => :ParseError, :data => _("Incorrect comment.")}
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
    @token_queue << @current_token
    @state = :data_state
  else
    # The dash seen earlier was comment text after all: add it back together
    # with this character and the run up to the next "-".
    @current_token[:data] += '-' + char + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end
# State handler for the body of a comment: accumulates text until a "-" is
# seen. Always returns true.
def comment_state
  char = @stream.char
  case char
  when "-"
    @state = :comment_end_dash_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:data] += char + @stream.chars_until("-")
  end
  true
end
# State handler for the character following a single "-" inside a comment.
# Always returns true.
def comment_end_dash_state
  char = @stream.char
  case char
  when "-"
    @state = :comment_end_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment (-)")}
    @token_queue << @current_token
    @state = :data_state
  else
    # The earlier dash was ordinary text: put it back in front of this
    # character and the run that follows, up to the next "-".
    run = @stream.chars_until("-")
    @current_token[:data] += "-" + char + run
    # Also consume whatever stopped the run (a "-" or :EOF). The state is
    # deliberately left unchanged, so a "-" directly after this one leads
    # to the comment end state without a ParseError being emitted there.
    @stream.char
  end
  true
end
# State handler for the character following "--" inside a comment.
# ">" completes the comment; anything else is a parse error.
# Always returns true.
def comment_end_state
  char = @stream.char
  case char
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when "-"
    # A third dash: report it, keep it as text and stay in this state.
    @token_queue << {:type => :ParseError, :data => _("Unexpected '-' after '--' found in comment.")}
    @current_token[:data] += char
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment (--).")}
    @token_queue << @current_token
    @state = :data_state
  else
    # XXX
    # The "--" did not close the comment: keep it as text and carry on.
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in comment found.")}
    @current_token[:data] += "--" + char
    @state = :comment_state
  end
  true
end
# State handler for the character right after the literal "DOCTYPE".
# Whitespace is expected; anything else is reported and pushed back so the
# next state reads it. Either way the next state is
# :before_doctype_name_state. Always returns true.
def doctype_state
  char = @stream.char
  unless SPACE_CHARACTERS.include?(char)
    @token_queue << {:type => :ParseError, :data => _("No space after literal string 'DOCTYPE'.")}
    @stream.unget(char)
  end
  @state = :before_doctype_name_state
  true
end
# State handler that skips whitespace before the DOCTYPE name and starts
# the name with the first non-space character. A ">" or end of file here
# queues the token marked as incorrect. Always returns true.
def before_doctype_name_state
  char = @stream.char
  return true if SPACE_CHARACTERS.include?(char)

  if char == ">" || char == :EOF
    message =
      if char == ">"
        _("Unexpected > character. Expected DOCTYPE name.")
      else
        _("Unexpected end of file. Expected DOCTYPE name.")
      end
    @token_queue << {:type => :ParseError, :data => message}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] = char
    @state = :doctype_name_state
  end
  true
end
# State handler that accumulates the DOCTYPE name one character at a time
# until whitespace, ">" or end of file. Always returns true.
def doctype_name_state
  char = @stream.char

  if SPACE_CHARACTERS.include?(char)
    @state = :after_doctype_name_state
    return true
  end

  case char
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE name.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] += char
  end
  true
end
# State handler for what follows the DOCTYPE name: whitespace is skipped,
# ">" queues the token, and otherwise the next six characters must spell
# "public" or "system" (compared after lower-casing ASCII letters).
# Anything else is reported, pushed back onto the stream and handled by
# :bogus_doctype_state. Always returns true.
def after_doctype_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    # Skip whitespace; stay in this state.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @current_token[:correct] = false
    @stream.unget(data)
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @token_queue << @current_token
    @state = :data_state
  else
    char_stack = [data]
    # Read through @stream, like every other state handler does, rather
    # than relying on a separate `stream` reader method being available.
    5.times { char_stack << @stream.char }
    token = char_stack.join('').tr(ASCII_UPPERCASE,ASCII_LOWERCASE)
    hit_eof = char_stack.include?(:EOF)
    if token == "public" && !hit_eof
      @state = :before_doctype_public_identifier_state
    elsif token == "system" && !hit_eof
      @state = :before_doctype_system_identifier_state
    else
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => _("Expected 'public' or 'system'. Got '#{token}'")}
      @state = :bogus_doctype_state
    end
  end
  true
end
# State handler run after the "public" keyword of a DOCTYPE: skips
# whitespace and expects an opening quote for the public identifier.
# Always returns true.
def before_doctype_public_identifier_state
  char = @stream.char
  return true if SPACE_CHARACTERS.include?(char)

  case char
  when "\""
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_double_quoted_state
  when "'"
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_single_quoted_state
  when ">", :EOF
    message =
      if char == ">"
        _("Unexpected end of DOCTYPE.")
      else
        _("Unexpected end of file in DOCTYPE.")
      end
    @token_queue << {:type => :ParseError, :data => message}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
    @state = :bogus_doctype_state
  end
  true
end
# State handler for the inside of a double-quoted DOCTYPE public
# identifier: characters are appended to :publicId until the closing quote.
# Always returns true.
def doctype_public_identifier_double_quoted_state
  char = @stream.char
  case char
  when "\""
    @state = :after_doctype_public_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:publicId] += char
  end
  true
end
# State handler for the inside of a single-quoted DOCTYPE public
# identifier: characters are appended to :publicId until the closing "'".
# Always returns true.
def doctype_public_identifier_single_quoted_state
  char = @stream.char
  case char
  when "'"
    @state = :after_doctype_public_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:publicId] += char
  end
  true
end
# State handler run after a DOCTYPE public identifier has been closed:
# skips whitespace, then accepts an optional quoted system identifier or
# the closing ">". Always returns true.
def after_doctype_public_identifier_state
  char = @stream.char
  return true if SPACE_CHARACTERS.include?(char)

  case char
  when "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  when "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
    @state = :bogus_doctype_state
  end
  true
end
# State handler run after the "system" keyword of a DOCTYPE: skips
# whitespace and expects an opening quote for the system identifier.
# Always returns true.
def before_doctype_system_identifier_state
  char = @stream.char
  return true if SPACE_CHARACTERS.include?(char)

  case char
  when "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  when "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  when ">", :EOF
    message =
      if char == ">"
        _("Unexpected character in DOCTYPE.")
      else
        _("Unexpected end of file in DOCTYPE.")
      end
    @token_queue << {:type => :ParseError, :data => message}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
    @state = :bogus_doctype_state
  end
  true
end
# State handler for the inside of a double-quoted DOCTYPE system
# identifier: characters are appended to :systemId until the closing quote.
# Always returns true.
def doctype_system_identifier_double_quoted_state
  char = @stream.char
  case char
  when "\""
    @state = :after_doctype_system_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:systemId] += char
  end
  true
end
# State handler for the inside of a single-quoted DOCTYPE system
# identifier: characters are appended to :systemId until the closing "'".
# Always returns true.
def doctype_system_identifier_single_quoted_state
  char = @stream.char
  case char
  when "'"
    @state = :after_doctype_system_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:systemId] += char
  end
  true
end
# State handler run after a DOCTYPE system identifier has been closed:
# skips whitespace and expects the closing ">". Always returns true.
def after_doctype_system_identifier_state
  char = @stream.char
  return true if SPACE_CHARACTERS.include?(char)

  case char
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
    @state = :bogus_doctype_state
  end
  true
end
# State handler for a malformed DOCTYPE: marks the token as incorrect and
# discards characters until ">" or end of file, at which point the token is
# queued. Always returns true.
def bogus_doctype_state
  char = @stream.char
  @current_token[:correct] = false

  case char
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    # XXX EMIT
    @stream.unget(char)
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in bogus doctype.")}
    @token_queue << @current_token
    @state = :data_state
  end
  true
end
# Wrapper applied to every user-visible message in this file; it currently
# returns the message unchanged.
def _(string)
  string
end
end
end

View file

@ -1,24 +1,24 @@
module HTML5lib module HTML5
module TreeBuilders module TreeBuilders
class << self class << self
def [](name) def [](name)
case name.to_s.downcase case name.to_s.downcase
when 'simpletree' then when 'simpletree' then
require 'html5lib/treebuilders/simpletree' require 'html5/treebuilders/simpletree'
SimpleTree::TreeBuilder SimpleTree::TreeBuilder
when 'rexml' then when 'rexml' then
require 'html5lib/treebuilders/rexml' require 'html5/treebuilders/rexml'
REXML::TreeBuilder REXML::TreeBuilder
when 'hpricot' then when 'hpricot' then
require 'html5lib/treebuilders/hpricot' require 'html5/treebuilders/hpricot'
Hpricot::TreeBuilder Hpricot::TreeBuilder
else else
raise "Unknown TreeBuilder #{name}" raise "Unknown TreeBuilder #{name}"
end end
end end
alias :getTreeBuilder :[] alias :get_tree_builder :[]
end end
end end
end end

View file

@ -1,8 +1,8 @@
require 'html5lib/constants' require 'html5/constants'
#XXX - TODO; make the default interface more ElementTree-like rather than DOM-like #XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
module HTML5lib module HTML5
# The scope markers are inserted when entering buttons, object elements, # The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting # marquees, table cells, and table captions, and are used to prevent formatting
@ -76,13 +76,13 @@ module HTML5lib
# Base treebuilder implementation # Base treebuilder implementation
class TreeBuilder class TreeBuilder
attr_accessor :openElements attr_accessor :open_elements
attr_accessor :activeFormattingElements attr_accessor :activeFormattingElements
attr_accessor :document attr_accessor :document
attr_accessor :headPointer attr_accessor :head_pointer
attr_accessor :formPointer attr_accessor :formPointer
@ -106,25 +106,25 @@ module HTML5lib
end end
def reset def reset
@openElements = [] @open_elements = []
@activeFormattingElements = [] @activeFormattingElements = []
#XXX - rename these to headElement, formElement #XXX - rename these to headElement, formElement
@headPointer = nil @head_pointer = nil
@formPointer = nil @formPointer = nil
self.insertFromTable = false self.insert_from_table = false
@document = @documentClass.new @document = @documentClass.new
end end
def elementInScope(target, tableVariant=false) def elementInScope(target, tableVariant=false)
# Exit early when possible. # Exit early when possible.
return true if @openElements[-1].name == target return true if @open_elements[-1].name == target
# AT How about while true and simply set node to [-1] and set it to # AT How about while true and simply set node to [-1] and set it to
# [-2] at the end... # [-2] at the end...
@openElements.reverse.each do |element| @open_elements.reverse.each do |element|
if element.name == target if element.name == target
return true return true
elsif element.name == 'table' elsif element.name == 'table'
@ -149,10 +149,10 @@ module HTML5lib
# Step 2 and step 3: we start with the last element. So i is -1. # Step 2 and step 3: we start with the last element. So i is -1.
i = -1 i = -1
entry = @activeFormattingElements[i] entry = @activeFormattingElements[i]
return if entry == Marker or @openElements.include?(entry) return if entry == Marker or @open_elements.include?(entry)
# Step 6 # Step 6
until entry == Marker or @openElements.include?(entry) until entry == Marker or @open_elements.include?(entry)
# Step 5: let entry be one earlier in the list. # Step 5: let entry be one earlier in the list.
i -= 1 i -= 1
begin begin
@ -171,7 +171,7 @@ module HTML5lib
clone = @activeFormattingElements[i].cloneNode clone = @activeFormattingElements[i].cloneNode
# Step 9 # Step 9
element = insertElement(clone.name, clone.attributes) element = insert_element(clone.name, clone.attributes)
# Step 10 # Step 10
@activeFormattingElements[i] = element @activeFormattingElements[i] = element
@ -198,12 +198,15 @@ module HTML5lib
return false return false
end end
def insertDoctype(name) def insertDoctype(name, public_id, system_id)
@document.appendChild(@doctypeClass.new(name)) doctype = @doctypeClass.new(name)
doctype.public_id = public_id
doctype.system_id = system_id
@document.appendChild(doctype)
end end
def insertComment(data, parent=nil) def insert_comment(data, parent=nil)
parent = @openElements[-1] if parent.nil? parent = @open_elements[-1] if parent.nil?
parent.appendChild(@commentClass.new(data)) parent.appendChild(@commentClass.new(data))
end end
@ -216,28 +219,28 @@ module HTML5lib
# Switch the function used to insert an element from the # Switch the function used to insert an element from the
# normal one to the misnested table one and back again # normal one to the misnested table one and back again
def insertFromTable=(value) def insert_from_table=(value)
@insertFromTable = value @insert_from_table = value
@insertElement = value ? :insertElementTable : :insertElementNormal @insert_element = value ? :insert_elementTable : :insert_elementNormal
end end
def insertElement(name, attributes) def insert_element(name, attributes)
send(@insertElement, name, attributes) send(@insert_element, name, attributes)
end end
def insertElementNormal(name, attributes) def insert_elementNormal(name, attributes)
element = @elementClass.new(name) element = @elementClass.new(name)
element.attributes = attributes element.attributes = attributes
@openElements[-1].appendChild(element) @open_elements.last.appendChild(element)
@openElements.push(element) @open_elements.push(element)
return element return element
end end
# Create an element and insert it into the tree # Create an element and insert it into the tree
def insertElementTable(name, attributes) def insert_elementTable(name, attributes)
element = @elementClass.new(name) element = @elementClass.new(name)
element.attributes = attributes element.attributes = attributes
if TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name) if TABLE_INSERT_MODE_ELEMENTS.include?(@open_elements.last.name)
#We should be in the InTable mode. This means we want to do #We should be in the InTable mode. This means we want to do
#special magic element rearranging #special magic element rearranging
parent, insertBefore = getTableMisnestedNodePosition parent, insertBefore = getTableMisnestedNodePosition
@ -246,17 +249,17 @@ module HTML5lib
else else
parent.insertBefore(element, insertBefore) parent.insertBefore(element, insertBefore)
end end
@openElements.push(element) @open_elements.push(element)
else else
return insertElementNormal(name, attributes) return insert_elementNormal(name, attributes)
end end
return element return element
end end
def insertText(data, parent=nil) def insertText(data, parent=nil)
parent = @openElements[-1] if parent.nil? parent = @open_elements[-1] if parent.nil?
if (not(@insertFromTable) or (@insertFromTable and not TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name))) if (not(@insert_from_table) or (@insert_from_table and not TABLE_INSERT_MODE_ELEMENTS.include?(@open_elements[-1].name)))
parent.insertText(data) parent.insertText(data)
else else
#We should be in the InTable mode. This means we want to do #We should be in the InTable mode. This means we want to do
@ -275,7 +278,7 @@ module HTML5lib
lastTable = nil lastTable = nil
fosterParent = nil fosterParent = nil
insertBefore = nil insertBefore = nil
@openElements.reverse.each do |element| @open_elements.reverse.each do |element|
if element.name == "table" if element.name == "table"
lastTable = element lastTable = element
break break
@ -288,33 +291,34 @@ module HTML5lib
fosterParent = lastTable.parent fosterParent = lastTable.parent
insertBefore = lastTable insertBefore = lastTable
else else
fosterParent = @openElements[@openElements.index(lastTable) - 1] fosterParent = @open_elements[@open_elements.index(lastTable) - 1]
end end
else else
fosterParent = @openElements[0] fosterParent = @open_elements[0]
end end
return fosterParent, insertBefore return fosterParent, insertBefore
end end
def generateImpliedEndTags(exclude=nil) def generateImpliedEndTags(exclude=nil)
name = @openElements[-1].name name = @open_elements[-1].name
if (['dd', 'dt', 'li', 'p', 'td', 'th', 'tr'].include?(name) and name != exclude) # XXX td, th and tr are not actually needed
@openElements.pop if (%w[dd dt li p td th tr].include?(name) and name != exclude)
@open_elements.pop
# XXX This is not entirely what the specification says. We should # XXX This is not entirely what the specification says. We should
# investigate it more closely. # investigate it more closely.
generateImpliedEndTags(exclude) generateImpliedEndTags(exclude)
end end
end end
def getDocument def get_document
@document @document
end end
def getFragment def get_fragment
#assert @innerHTML #assert @inner_html
fragment = @fragmentClass.new fragment = @fragmentClass.new
@openElements[0].reparentChildren(fragment) @open_elements[0].reparentChildren(fragment)
return fragment return fragment
end end

View file

@ -1,14 +1,13 @@
require 'html5lib/treebuilders/base' require 'html5/treebuilders/base'
require 'rubygems' require 'rubygems'
require 'hpricot' require 'hpricot'
require 'forwardable' require 'forwardable'
module HTML5lib module HTML5
module TreeBuilders module TreeBuilders
module Hpricot module Hpricot
class Node < Base::Node class Node < Base::Node
extend Forwardable extend Forwardable
def_delegators :@hpricot, :name def_delegators :@hpricot, :name
@ -22,7 +21,7 @@ module HTML5lib
def appendChild(node) def appendChild(node)
if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode) if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s childNodes.last.hpricot.content = childNodes.last.hpricot.content + node.hpricot.content
else else
childNodes << node childNodes << node
hpricot.children << node.hpricot hpricot.children << node.hpricot
@ -145,21 +144,27 @@ module HTML5lib
end end
class DocumentType < Node class DocumentType < Node
def_delegators :@hpricot, :public_id, :system_id
def self.hpricot_class def self.hpricot_class
::Hpricot::DocType ::Hpricot::DocType
end end
def initialize(name) def initialize(name, public_id, system_id)
begin begin
super(name) super(name)
rescue ArgumentError # needs 3... rescue ArgumentError # needs 3...
end end
@hpricot = ::Hpricot::DocType.new(name, nil, nil) @hpricot = ::Hpricot::DocType.new(name, public_id, system_id)
end end
def printTree(indent=0) def printTree(indent=0)
if hpricot.target and hpricot.target.any?
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>" "\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
else
"\n|#{' ' * indent}<!DOCTYPE >"
end
end end
end end
@ -169,7 +174,7 @@ module HTML5lib
end end
def printTree(indent=0) def printTree(indent=0)
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) } childNodes.inject('') {|tree, child| tree + child.printTree(indent + 2) }
end end
end end
@ -202,15 +207,20 @@ module HTML5lib
@fragmentClass = DocumentFragment @fragmentClass = DocumentFragment
end end
def insertDoctype(name, public_id, system_id)
doctype = @doctypeClass.new(name, public_id, system_id)
@document.appendChild(doctype)
end
def testSerializer(node) def testSerializer(node)
node.printTree node.printTree
end end
def getDocument def get_document
@document.hpricot @document.hpricot
end end
def getFragment def get_fragment
@document = super @document = super
return @document.hpricot.children return @document.hpricot.children
end end

View file

@ -1,8 +1,8 @@
require 'html5lib/treebuilders/base' require 'html5/treebuilders/base'
require 'rexml/document' require 'rexml/document'
require 'forwardable' require 'forwardable'
module HTML5lib module HTML5
module TreeBuilders module TreeBuilders
module REXML module REXML
@ -17,11 +17,9 @@ module HTML5lib
end end
def appendChild node def appendChild node
if node.kind_of? TextNode and if node.kind_of?(TextNode) && childNodes.length > 0 && childNodes.last.kind_of?(TextNode)
childNodes.length>0 and childNodes[-1].kind_of? TextNode childNodes.last.rxobj.value = childNodes.last.rxobj.to_s + node.rxobj.to_s
childNodes[-1].rxobj.value = childNodes.last.rxobj.raw = true
childNodes[-1].rxobj.to_s + node.rxobj.to_s
childNodes[-1].rxobj.raw = true
else else
childNodes.push node childNodes.push node
rxobj.add node.rxobj rxobj.add node.rxobj
@ -45,10 +43,8 @@ module HTML5lib
def insertBefore node, refNode def insertBefore node, refNode
index = childNodes.index(refNode) index = childNodes.index(refNode)
if node.kind_of? TextNode and index>0 and if node.kind_of?(TextNode) and index > 0 && childNodes[index-1].kind_of?(TextNode)
childNodes[index-1].kind_of? TextNode childNodes[index-1].rxobj.value = childNodes[index-1].rxobj.to_s + node.rxobj.to_s
childNodes[index-1].rxobj.value =
childNodes[index-1].rxobj.to_s + node.rxobj.to_s
childNodes[index-1].rxobj.raw = true childNodes[index-1].rxobj.raw = true
else else
childNodes.insert index, node childNodes.insert index, node
@ -57,7 +53,7 @@ module HTML5lib
end end
def hasContent def hasContent
return (childNodes.length > 0) (childNodes.length > 0)
end end
end end
@ -77,7 +73,7 @@ module HTML5lib
end end
def attributes= value def attributes= value
value.each {|name, value| rxobj.attributes[name]=value} value.each {|name, value| rxobj.attributes[name] = value}
end end
def printTree indent=0 def printTree indent=0
@ -90,7 +86,7 @@ module HTML5lib
for child in childNodes for child in childNodes
tree += child.printTree(indent) tree += child.printTree(indent)
end end
return tree tree
end end
end end
@ -120,10 +116,25 @@ module HTML5lib
end end
class DocumentType < Node class DocumentType < Node
def_delegator :@rxobj, :public, :public_id
def_delegator :@rxobj, :system, :system_id
def self.rxclass def self.rxclass
::REXML::DocType ::REXML::DocType
end end
def initialize name, public_id, system_id
super(name)
if public_id
@rxobj = ::REXML::DocType.new [name, ::REXML::DocType::PUBLIC, public_id, system_id]
elsif system_id
@rxobj = ::REXML::DocType.new [name, ::REXML::DocType::SYSTEM, nil, system_id]
else
@rxobj = ::REXML::DocType.new name
end
end
def printTree indent=0 def printTree indent=0
"\n|#{' ' * indent}<!DOCTYPE #{name}>" "\n|#{' ' * indent}<!DOCTYPE #{name}>"
end end
@ -145,7 +156,7 @@ module HTML5lib
class TextNode < Node class TextNode < Node
def initialize data def initialize data
raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;') raw = data.gsub('&', '&amp;').gsub('<', '&lt;').gsub('>', '&gt;')
@rxobj = ::REXML::Text.new(raw, true, nil, true) @rxobj = ::REXML::Text.new(raw, true, nil, true)
end end
@ -173,15 +184,20 @@ module HTML5lib
@fragmentClass = DocumentFragment @fragmentClass = DocumentFragment
end end
def testSerializer node def insertDoctype(name, public_id, system_id)
node.printTree() doctype = @doctypeClass.new(name, public_id, system_id)
@document.appendChild(doctype)
end end
def getDocument def testSerializer node
node.printTree
end
def get_document
@document.rxobj @document.rxobj
end end
def getFragment def get_fragment
@document = super @document = super
return @document.rxobj.children return @document.rxobj.children
end end

View file

@ -1,6 +1,6 @@
require 'html5lib/treebuilders/base' require 'html5/treebuilders/base'
module HTML5lib module HTML5
module TreeBuilders module TreeBuilders
module SimpleTree module SimpleTree
@ -25,10 +25,10 @@ module HTML5lib
def appendChild node def appendChild node
if node.kind_of? TextNode and if node.kind_of? TextNode and
childNodes.length>0 and childNodes[-1].kind_of? TextNode childNodes.length > 0 and childNodes.last.kind_of? TextNode
childNodes[-1].value += node.value childNodes.last.value += node.value
else else
childNodes.push node childNodes << node
end end
node.parent = self node.parent = self
end end
@ -55,8 +55,7 @@ module HTML5lib
def insertBefore node, refNode def insertBefore node, refNode
index = childNodes.index(refNode) index = childNodes.index(refNode)
if node.kind_of? TextNode and index>0 and if node.kind_of?(TextNode) && index > 0 && childNodes[index-1].kind_of?(TextNode)
childNodes[index-1].kind_of? TextNode
childNodes[index-1].value += node.value childNodes[index-1].value += node.value
else else
childNodes.insert index, node childNodes.insert index, node
@ -72,7 +71,7 @@ module HTML5lib
end end
def hasContent def hasContent
return (childNodes.length > 0) childNodes.length > 0
end end
end end
@ -90,7 +89,7 @@ module HTML5lib
for child in childNodes for child in childNodes
tree += child.printTree(indent) tree += child.printTree(indent)
end end
return tree tree
end end
end end
@ -108,13 +107,21 @@ module HTML5lib
for child in childNodes for child in childNodes
tree += child.printTree(indent + 2) tree += child.printTree(indent + 2)
end end
return tree tree
end end
end end
class DocumentType < Node class DocumentType < Node
attr_accessor :public_id, :system_id
def to_s def to_s
"<!DOCTYPE %s>" % name "<!DOCTYPE #{name}>"
end
def initialize name
super name
@public_id = nil
@system_id = nil
end end
end end
@ -164,12 +171,12 @@ module HTML5lib
end end
def testSerializer node def testSerializer node
node.printTree() node.printTree
end end
def getFragment def get_fragment
@document = super @document = super
return @document.childNodes @document.childNodes
end end
end end

View file

@ -0,0 +1,26 @@
require 'html5/treewalkers/base'
module HTML5
  module TreeWalkers
    class << self
      # Looks up a tree walker class by backend name (case-insensitive;
      # symbols are accepted). The backend's file is only required when it
      # is actually requested. Raises a RuntimeError for unknown names.
      def [](name)
        loaders = {
          'simpletree' => lambda {
            require 'html5/treewalkers/simpletree'
            SimpleTree::TreeWalker
          },
          'rexml' => lambda {
            require 'html5/treewalkers/rexml'
            REXML::TreeWalker
          },
          'hpricot' => lambda {
            require 'html5/treewalkers/hpricot'
            Hpricot::TreeWalker
          }
        }

        loader = loaders[name.to_s.downcase]
        raise "Unknown TreeWalker #{name}" unless loader
        loader.call
      end

      alias_method :get_tree_walker, :[]
    end
  end
end

View file

@ -0,0 +1,154 @@
require 'html5/constants'
module HTML5
module TreeWalkers
# Mixin with factory methods that build the token hashes produced by tree
# walkers. Every token is a Hash with a :type key plus type-specific keys.
module TokenConstructor
  # Builds an error token carrying +msg+. Note that its :type is the String
  # "SerializeError", whereas the other tokens use Symbols.
  def error(msg)
    {:type => "SerializeError", :data => msg}
  end

  # Converts an attribute collection into an array via #to_a.
  def normalize_attrs(attrs)
    attrs.to_a
  end

  # Builds an :EmptyTag token for a void element.
  def empty_tag(name, attrs, has_children=false)
    # NOTE(review): the error token built here is neither returned nor
    # yielded, so callers never see it - confirm whether that is intended.
    error(_("Void element has children")) if has_children
    {:type => :EmptyTag, :name => name, :data => normalize_attrs(attrs)}
  end

  # Builds a :StartTag token.
  def start_tag(name, attrs)
    {:type => :StartTag, :name => name, :data => normalize_attrs(attrs)}
  end

  # Builds an :EndTag token; its :data is always an empty array.
  def end_tag(name)
    {:type => :EndTag, :name => name, :data => []}
  end

  # Splits +data+ into leading whitespace, content and trailing whitespace
  # and yields one token for each non-empty part: :SpaceCharacters for the
  # whitespace runs, :Characters for the rest (inner whitespace is kept).
  def text(data)
    space_run = "[#{SPACE_CHARACTERS.join('')}]+"

    if data =~ /\A(#{space_run})/m
      leading = Regexp.last_match(1)
      yield({:type => :SpaceCharacters, :data => leading})
      data = data[leading.length .. -1]
      return if data.empty?
    end

    if data =~ /(#{space_run})\Z/m
      trailing = Regexp.last_match(1)
      yield({:type => :Characters, :data => data[0 ... -trailing.length]})
      yield({:type => :SpaceCharacters, :data => trailing})
    else
      yield({:type => :Characters, :data => data})
    end
  end

  # Builds a :Comment token.
  def comment(data)
    {:type => :Comment, :data => data}
  end

  # Builds a :Doctype token.
  def doctype(name, public_id, system_id, correct=nil)
    {:type => :Doctype, :name => name, :public_id => public_id, :system_id => system_id, :correct => correct}
  end

  # Builds an error token for a node type the walker does not recognise.
  def unknown(nodeType)
    error(_("Unknown node type: ") + nodeType.to_s)
  end

  # Wrapper applied to user-visible messages; returns +str+ unchanged.
  def _(str)
    str
  end
end
# Abstract tree walker: stores the tree to walk and mixes in the token
# factory methods. Subclasses must provide #each.
class Base
  include TokenConstructor

  # tree:: the object to be walked; it is stored as-is in @tree.
  def initialize(tree)
    @tree = tree
  end

  # Must be overridden by subclasses; this implementation always raises
  # NotImplementedError.
  def each
    raise NotImplementedError
  end

  # #walk is a second name for the #each defined in this class.
  alias_method :walk, :each
end
# Tree walker that traverses @tree iteratively (no recursion), yielding one
# token per event. Subclasses supply the four navigation primitives below.
class NonRecursiveTreeWalker < TreeWalkers::Base
# Must return an Array describing +node+. #each reads the first element as
# the node kind (:DOCTYPE, :TEXT, :ELEMENT, :COMMENT, :DOCUMENT,
# :DOCUMENT_FRAGMENT or nil) and the remaining elements as kind-specific
# details; for :ELEMENT these are name, attributes and a has-children flag.
def node_details(node)
raise NotImplementedError
end
# Must return the first child of +node+; #each treats nil as "no children".
def first_child(node)
raise NotImplementedError
end
# Must return the next sibling of +node+; #each treats nil as "no more
# siblings".
def next_sibling(node)
raise NotImplementedError
end
# Must return the parent of +node+.
def parent(node)
raise NotImplementedError
end
# Walks the tree rooted at @tree in document order, yielding tokens built
# by the token factory methods. Start/empty tags are yielded on the way
# down, end tags on the way back up.
def each
current_node = @tree
while current_node != nil
details = node_details(current_node)
has_children = false
# shift removes the node kind, leaving only the kind-specific details.
case details.shift
when :DOCTYPE
yield doctype(*details)
when :TEXT
text(*details) {|token| yield token}
when :ELEMENT
name, attributes, has_children = details
if VOID_ELEMENTS.include?(name)
yield empty_tag(name, attributes.to_a, has_children)
# Never descend into a void element, even if it reports children.
has_children = false
else
yield start_tag(name, attributes.to_a)
end
when :COMMENT
yield comment(details[0])
when :DOCUMENT, :DOCUMENT_FRAGMENT
has_children = true
when nil
# ignore (REXML::XMLDecl is an example)
else
yield unknown(details[0])
end
first_child = has_children ? first_child(current_node) : nil
if first_child != nil
# Descend first.
current_node = first_child
else
# No children to visit: close this node, then move to its next sibling.
# If there is none, climb to the parent and repeat, closing each
# ancestor on the way, until a sibling is found or the root is reached.
while current_node != nil
details = node_details(current_node)
if details.shift == :ELEMENT
name, attributes, has_children = details
yield end_tag(name) if !VOID_ELEMENTS.include?(name)
end
if @tree == current_node
# Back at the root: setting current_node to nil ends both loops.
current_node = nil
else
next_sibling = next_sibling(current_node)
if next_sibling != nil
current_node = next_sibling
break
end
current_node = parent(current_node)
end
end
end
end
end
end
end
end

View file

@ -1,10 +1,10 @@
require 'html5lib/treewalkers/base' require 'html5/treewalkers/base'
require 'rexml/document' require 'rexml/document'
module HTML5lib module HTML5
module TreeWalkers module TreeWalkers
module Hpricot module Hpricot
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
def node_details(node) def node_details(node)
case node case node
@ -13,17 +13,17 @@ module HTML5lib
[:DOCUMENT_FRAGMENT] [:DOCUMENT_FRAGMENT]
else else
[:ELEMENT, node.name, [:ELEMENT, node.name,
node.attributes.map {|name,value| [name,value]}, node.attributes.map {|name, value| [name, value]},
!node.empty?] !node.empty?]
end end
when ::Hpricot::Text when ::Hpricot::Text
[:TEXT, node.to_plain_text] [:TEXT, node.content]
when ::Hpricot::Comment when ::Hpricot::Comment
[:COMMENT, node.content] [:COMMENT, node.content]
when ::Hpricot::Doc when ::Hpricot::Doc
[:DOCUMENT] [:DOCUMENT]
when ::Hpricot::DocType when ::Hpricot::DocType
[:DOCTYPE, node.target] [:DOCTYPE, node.target, node.public_id, node.system_id]
when ::Hpricot::XMLDecl when ::Hpricot::XMLDecl
[nil] [nil]
else else

View file

@ -1,10 +1,10 @@
require 'html5lib/treewalkers/base' require 'html5/treewalkers/base'
require 'rexml/document' require 'rexml/document'
module HTML5lib module HTML5
module TreeWalkers module TreeWalkers
module REXML module REXML
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
def node_details(node) def node_details(node)
case node case node
@ -23,7 +23,7 @@ module HTML5lib
when ::REXML::Comment when ::REXML::Comment
[:COMMENT, node.string] [:COMMENT, node.string]
when ::REXML::DocType when ::REXML::DocType
[:DOCTYPE, node.name] [:DOCTYPE, node.name, node.public, node.system]
when ::REXML::XMLDecl when ::REXML::XMLDecl
[nil] [nil]
else else

View file

@ -1,10 +1,10 @@
require 'html5lib/treewalkers/base' require 'html5/treewalkers/base'
module HTML5lib module HTML5
module TreeWalkers module TreeWalkers
module SimpleTree module SimpleTree
class TreeWalker < HTML5lib::TreeWalkers::Base class TreeWalker < HTML5::TreeWalkers::Base
include HTML5lib::TreeBuilders::SimpleTree include HTML5::TreeBuilders::SimpleTree
def walk(node) def walk(node)
case node case node
@ -12,20 +12,20 @@ module HTML5lib
return return
when DocumentType when DocumentType
yield doctype(node.name) yield doctype(node.name, node.public_id, node.system_id)
when TextNode when TextNode
text(node.value) {|token| yield token} text(node.value) {|token| yield token}
when Element when Element
if VOID_ELEMENTS.include?(node.name) if VOID_ELEMENTS.include?(node.name)
yield emptyTag(node.name, node.attributes, node.hasContent()) yield empty_tag(node.name, node.attributes, node.hasContent())
else else
yield startTag(node.name, node.attributes) yield start_tag(node.name, node.attributes)
for child in node.childNodes for child in node.childNodes
walk(child) {|token| yield token} walk(child) {|token| yield token}
end end
yield endTag(node.name) yield end_tag(node.name)
end end
when CommentNode when CommentNode

View file

@ -0,0 +1,3 @@
module HTML5
  # Version string of this library. Frozen so the shared constant cannot be
  # mutated in place by callers.
  VERSION = '0.1.0'.freeze
end

View file

@ -1,11 +0,0 @@
require 'html5lib/html5parser'
module HTML5lib
  # Parses +stream+ by delegating to HTMLParser.parse.
  #
  # stream::  the input handed through to the parser.
  # options:: options hash handed through to the parser.
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  # Parses +stream+ as a fragment by delegating to HTMLParser.parseFragment.
  # Previously this called HTMLParser.parse, so fragments were parsed exactly
  # like whole documents.
  #
  # NOTE(review): HTMLParser.parseFragment is defined in
  # html5lib/html5parser, which is not visible here - confirm it accepts
  # (stream, options).
  def self.parseFragment(stream, options={})
    HTMLParser.parseFragment(stream, options)
  end
end

View file

@ -1,708 +0,0 @@
module HTML5lib
# Exception class named for end-of-input.
# NOTE(review): it derives from Exception rather than StandardError, so a
# bare `rescue` will not catch it; callers must rescue EOF explicitly.
class EOF < Exception; end
CONTENT_MODEL_FLAGS = [
:PCDATA,
:RCDATA,
:CDATA,
:PLAINTEXT
]
SCOPING_ELEMENTS = %w[
button
caption
html
marquee
object
table
td
th
]
FORMATTING_ELEMENTS = %w[
a
b
big
em
font
i
nobr
s
small
strike
strong
tt
u
]
SPECIAL_ELEMENTS = %w[
address
area
base
basefont
bgsound
blockquote
body
br
center
col
colgroup
dd
dir
div
dl
dt
embed
fieldset
form
frame
frameset
h1
h2
h3
h4
h5
h6
head
hr
iframe
image
img
input
isindex
li
link
listing
menu
meta
noembed
noframes
noscript
ol
optgroup
option
p
param
plaintext
pre
script
select
spacer
style
tbody
textarea
tfoot
thead
title
tr
ul
wbr
]
SPACE_CHARACTERS = %W[
\t
\n
\x0B
\x0C
\x20
\r
]
TABLE_INSERT_MODE_ELEMENTS = %w[
table
tbody
tfoot
thead
tr
]
# Character-class helpers. The three ASCII_* constants are Strings, DIGITS is
# a Range of one-character Strings, and HEX_DIGITS is an Array of
# one-character Strings (digits, then a-f, then A-F).
ASCII_LOWERCASE = [*'a'..'z'].join('')
ASCII_UPPERCASE = [*'A'..'Z'].join('')
ASCII_LETTERS = [ASCII_LOWERCASE, ASCII_UPPERCASE].join('')
DIGITS = ('0'..'9')
HEX_DIGITS = [DIGITS, 'a'..'f', 'A'..'F'].inject([]) { |chars, range| chars + range.to_a }
# Heading elements need to be ordered
HEADING_ELEMENTS = %w[
h1
h2
h3
h4
h5
h6
]
# XXX What about event-source and command?
VOID_ELEMENTS = %w[
base
link
meta
hr
br
img
embed
param
area
col
input
]
CDATA_ELEMENTS = %w[title textarea]
RCDATA_ELEMENTS = %w[
style
script
xmp
iframe
noembed
noframes
noscript
]
BOOLEAN_ATTRIBUTES = {
:global => %w[irrelevant],
'style' => %w[scoped],
'img' => %w[ismap],
'audio' => %w[autoplay controls],
'video' => %w[autoplay controls],
'script' => %w[defer async],
'details' => %w[open],
'datagrid' => %w[multiple disabled],
'command' => %w[hidden disabled checked default],
'menu' => %w[autosubmit],
'fieldset' => %w[disabled readonly],
'option' => %w[disabled readonly selected],
'optgroup' => %w[disabled readonly],
'button' => %w[disabled autofocus],
'input' => %w[disabled readonly required autofocus checked ismap],
'select' => %w[disabled readonly autofocus multiple],
'output' => %w[disabled readonly]
}
# ENTITIES_WINDOWS1252 has to stay _ordered_ and indexable: entry i is the replacement code point for byte 0x80 + i.
ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN
65533, # 0x81 UNDEFINED
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
8224, # 0x86 0x2020 DAGGER
8225, # 0x87 0x2021 DOUBLE DAGGER
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
8240, # 0x89 0x2030 PER MILLE SIGN
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
65533, # 0x8D UNDEFINED
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
65533, # 0x8F UNDEFINED
65533, # 0x90 UNDEFINED
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
8226, # 0x95 0x2022 BULLET
8211, # 0x96 0x2013 EN DASH
8212, # 0x97 0x2014 EM DASH
732, # 0x98 0x02DC SMALL TILDE
8482, # 0x99 0x2122 TRADE MARK SIGN
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
65533, # 0x9D UNDEFINED
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
]
private
# Returns the UTF-8 encoded String for the Unicode code point +n+.
# Being a singleton method (def self.), it is not affected by a preceding
# bare `private`.
def self.U(n)
  [n].pack('U*')
end
public
ENTITIES = {
"AElig" => U(0xC6),
"Aacute" => U(0xC1),
"Acirc" => U(0xC2),
"Agrave" => U(0xC0),
"Alpha" => U(0x0391),
"Aring" => U(0xC5),
"Atilde" => U(0xC3),
"Auml" => U(0xC4),
"Beta" => U(0x0392),
"Ccedil" => U(0xC7),
"Chi" => U(0x03A7),
"Dagger" => U(0x2021),
"Delta" => U(0x0394),
"ETH" => U(0xD0),
"Eacute" => U(0xC9),
"Ecirc" => U(0xCA),
"Egrave" => U(0xC8),
"Epsilon" => U(0x0395),
"Eta" => U(0x0397),
"Euml" => U(0xCB),
"Gamma" => U(0x0393),
"Iacute" => U(0xCD),
"Icirc" => U(0xCE),
"Igrave" => U(0xCC),
"Iota" => U(0x0399),
"Iuml" => U(0xCF),
"Kappa" => U(0x039A),
"Lambda" => U(0x039B),
"Mu" => U(0x039C),
"Ntilde" => U(0xD1),
"Nu" => U(0x039D),
"OElig" => U(0x0152),
"Oacute" => U(0xD3),
"Ocirc" => U(0xD4),
"Ograve" => U(0xD2),
"Omega" => U(0x03A9),
"Omicron" => U(0x039F),
"Oslash" => U(0xD8),
"Otilde" => U(0xD5),
"Ouml" => U(0xD6),
"Phi" => U(0x03A6),
"Pi" => U(0x03A0),
"Prime" => U(0x2033),
"Psi" => U(0x03A8),
"Rho" => U(0x03A1),
"Scaron" => U(0x0160),
"Sigma" => U(0x03A3),
"THORN" => U(0xDE),
"Tau" => U(0x03A4),
"Theta" => U(0x0398),
"Uacute" => U(0xDA),
"Ucirc" => U(0xDB),
"Ugrave" => U(0xD9),
"Upsilon" => U(0x03A5),
"Uuml" => U(0xDC),
"Xi" => U(0x039E),
"Yacute" => U(0xDD),
"Yuml" => U(0x0178),
"Zeta" => U(0x0396),
"aacute" => U(0xE1),
"acirc" => U(0xE2),
"acute" => U(0xB4),
"aelig" => U(0xE6),
"agrave" => U(0xE0),
"alefsym" => U(0x2135),
"alpha" => U(0x03B1),
"amp" => U(0x26),
"AMP" => U(0x26),
"and" => U(0x2227),
"ang" => U(0x2220),
"apos" => U(0x27),
"aring" => U(0xE5),
"asymp" => U(0x2248),
"atilde" => U(0xE3),
"auml" => U(0xE4),
"bdquo" => U(0x201E),
"beta" => U(0x03B2),
"brvbar" => U(0xA6),
"bull" => U(0x2022),
"cap" => U(0x2229),
"ccedil" => U(0xE7),
"cedil" => U(0xB8),
"cent" => U(0xA2),
"chi" => U(0x03C7),
"circ" => U(0x02C6),
"clubs" => U(0x2663),
"cong" => U(0x2245),
"copy" => U(0xA9),
"COPY" => U(0xA9),
"crarr" => U(0x21B5),
"cup" => U(0x222A),
"curren" => U(0xA4),
"dArr" => U(0x21D3),
"dagger" => U(0x2020),
"darr" => U(0x2193),
"deg" => U(0xB0),
"delta" => U(0x03B4),
"diams" => U(0x2666),
"divide" => U(0xF7),
"eacute" => U(0xE9),
"ecirc" => U(0xEA),
"egrave" => U(0xE8),
"empty" => U(0x2205),
"emsp" => U(0x2003),
"ensp" => U(0x2002),
"epsilon" => U(0x03B5),
"equiv" => U(0x2261),
"eta" => U(0x03B7),
"eth" => U(0xF0),
"euml" => U(0xEB),
"euro" => U(0x20AC),
"exist" => U(0x2203),
"fnof" => U(0x0192),
"forall" => U(0x2200),
"frac12" => U(0xBD),
"frac14" => U(0xBC),
"frac34" => U(0xBE),
"frasl" => U(0x2044),
"gamma" => U(0x03B3),
"ge" => U(0x2265),
"gt" => U(0x3E),
"GT" => U(0x3E),
"hArr" => U(0x21D4),
"harr" => U(0x2194),
"hearts" => U(0x2665),
"hellip" => U(0x2026),
"iacute" => U(0xED),
"icirc" => U(0xEE),
"iexcl" => U(0xA1),
"igrave" => U(0xEC),
"image" => U(0x2111),
"infin" => U(0x221E),
"int" => U(0x222B),
"iota" => U(0x03B9),
"iquest" => U(0xBF),
"isin" => U(0x2208),
"iuml" => U(0xEF),
"kappa" => U(0x03BA),
"lArr" => U(0x21D0),
"lambda" => U(0x03BB),
"lang" => U(0x2329),
"laquo" => U(0xAB),
"larr" => U(0x2190),
"lceil" => U(0x2308),
"ldquo" => U(0x201C),
"le" => U(0x2264),
"lfloor" => U(0x230A),
"lowast" => U(0x2217),
"loz" => U(0x25CA),
"lrm" => U(0x200E),
"lsaquo" => U(0x2039),
"lsquo" => U(0x2018),
"lt" => U(0x3C),
"LT" => U(0x3C),
"macr" => U(0xAF),
"mdash" => U(0x2014),
"micro" => U(0xB5),
"middot" => U(0xB7),
"minus" => U(0x2212),
"mu" => U(0x03BC),
"nabla" => U(0x2207),
"nbsp" => U(0xA0),
"ndash" => U(0x2013),
"ne" => U(0x2260),
"ni" => U(0x220B),
"not" => U(0xAC),
"notin" => U(0x2209),
"nsub" => U(0x2284),
"ntilde" => U(0xF1),
"nu" => U(0x03BD),
"oacute" => U(0xF3),
"ocirc" => U(0xF4),
"oelig" => U(0x0153),
"ograve" => U(0xF2),
"oline" => U(0x203E),
"omega" => U(0x03C9),
"omicron" => U(0x03BF),
"oplus" => U(0x2295),
"or" => U(0x2228),
"ordf" => U(0xAA),
"ordm" => U(0xBA),
"oslash" => U(0xF8),
"otilde" => U(0xF5),
"otimes" => U(0x2297),
"ouml" => U(0xF6),
"para" => U(0xB6),
"part" => U(0x2202),
"permil" => U(0x2030),
"perp" => U(0x22A5),
"phi" => U(0x03C6),
"pi" => U(0x03C0),
"piv" => U(0x03D6),
"plusmn" => U(0xB1),
"pound" => U(0xA3),
"prime" => U(0x2032),
"prod" => U(0x220F),
"prop" => U(0x221D),
"psi" => U(0x03C8),
"quot" => U(0x22),
"QUOT" => U(0x22),
"rArr" => U(0x21D2),
"radic" => U(0x221A),
"rang" => U(0x232A),
"raquo" => U(0xBB),
"rarr" => U(0x2192),
"rceil" => U(0x2309),
"rdquo" => U(0x201D),
"real" => U(0x211C),
"reg" => U(0xAE),
"REG" => U(0xAE),
"rfloor" => U(0x230B),
"rho" => U(0x03C1),
"rlm" => U(0x200F),
"rsaquo" => U(0x203A),
"rsquo" => U(0x2019),
"sbquo" => U(0x201A),
"scaron" => U(0x0161),
"sdot" => U(0x22C5),
"sect" => U(0xA7),
"shy" => U(0xAD),
"sigma" => U(0x03C3),
"sigmaf" => U(0x03C2),
"sim" => U(0x223C),
"spades" => U(0x2660),
"sub" => U(0x2282),
"sube" => U(0x2286),
"sum" => U(0x2211),
"sup" => U(0x2283),
"sup1" => U(0xB9),
"sup2" => U(0xB2),
"sup3" => U(0xB3),
"supe" => U(0x2287),
"szlig" => U(0xDF),
"tau" => U(0x03C4),
"there4" => U(0x2234),
"theta" => U(0x03B8),
"thetasym" => U(0x03D1),
"thinsp" => U(0x2009),
"thorn" => U(0xFE),
"tilde" => U(0x02DC),
"times" => U(0xD7),
"trade" => U(0x2122),
"uArr" => U(0x21D1),
"uacute" => U(0xFA),
"uarr" => U(0x2191),
"ucirc" => U(0xFB),
"ugrave" => U(0xF9),
"uml" => U(0xA8),
"upsih" => U(0x03D2),
"upsilon" => U(0x03C5),
"uuml" => U(0xFC),
"weierp" => U(0x2118),
"xi" => U(0x03BE),
"yacute" => U(0xFD),
"yen" => U(0xA5),
"yuml" => U(0xFF),
"zeta" => U(0x03B6),
"zwj" => U(0x200D),
"zwnj" => U(0x200C)
}
ENCODINGS = %w[
ansi_x3.4-1968
iso-ir-6
ansi_x3.4-1986
iso_646.irv:1991
ascii
iso646-us
us-ascii
us
ibm367
cp367
csascii
ks_c_5601-1987
korean
iso-2022-kr
csiso2022kr
euc-kr
iso-2022-jp
csiso2022jp
iso-2022-jp-2
iso-ir-58
chinese
csiso58gb231280
iso_8859-1:1987
iso-ir-100
iso_8859-1
iso-8859-1
latin1
l1
ibm819
cp819
csisolatin1
iso_8859-2:1987
iso-ir-101
iso_8859-2
iso-8859-2
latin2
l2
csisolatin2
iso_8859-3:1988
iso-ir-109
iso_8859-3
iso-8859-3
latin3
l3
csisolatin3
iso_8859-4:1988
iso-ir-110
iso_8859-4
iso-8859-4
latin4
l4
csisolatin4
iso_8859-6:1987
iso-ir-127
iso_8859-6
iso-8859-6
ecma-114
asmo-708
arabic
csisolatinarabic
iso_8859-7:1987
iso-ir-126
iso_8859-7
iso-8859-7
elot_928
ecma-118
greek
greek8
csisolatingreek
iso_8859-8:1988
iso-ir-138
iso_8859-8
iso-8859-8
hebrew
csisolatinhebrew
iso_8859-5:1988
iso-ir-144
iso_8859-5
iso-8859-5
cyrillic
csisolatincyrillic
iso_8859-9:1989
iso-ir-148
iso_8859-9
iso-8859-9
latin5
l5
csisolatin5
iso-8859-10
iso-ir-157
l6
iso_8859-10:1992
csisolatin6
latin6
hp-roman8
roman8
r8
ibm037
cp037
csibm037
ibm424
cp424
csibm424
ibm437
cp437
437
cspc8codepage437
ibm500
cp500
csibm500
ibm775
cp775
cspc775baltic
ibm850
cp850
850
cspc850multilingual
ibm852
cp852
852
cspcp852
ibm855
cp855
855
csibm855
ibm857
cp857
857
csibm857
ibm860
cp860
860
csibm860
ibm861
cp861
861
cp-is
csibm861
ibm862
cp862
862
cspc862latinhebrew
ibm863
cp863
863
csibm863
ibm864
cp864
csibm864
ibm865
cp865
865
csibm865
ibm866
cp866
866
csibm866
ibm869
cp869
869
cp-gr
csibm869
ibm1026
cp1026
csibm1026
koi8-r
cskoi8r
koi8-u
big5-hkscs
ptcp154
csptcp154
pt154
cp154
utf-7
utf-16be
utf-16le
utf-16
utf-8
iso-8859-13
iso-8859-14
iso-ir-199
iso_8859-14:1998
iso_8859-14
latin8
iso-celtic
l8
iso-8859-15
iso_8859-15
iso-8859-16
iso-ir-226
iso_8859-16:2001
iso_8859-16
latin10
l10
gbk
cp936
ms936
gb18030
shift_jis
ms_kanji
csshiftjis
euc-jp
gb2312
big5
csbig5
windows-1250
windows-1251
windows-1252
windows-1253
windows-1254
windows-1255
windows-1256
windows-1257
windows-1258
tis-620
hz-gb-2312
]
end

View file

@ -1 +0,0 @@
require 'html5lib/filters/optionaltags'

View file

@ -1,57 +0,0 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase that handles tokens while inside a <frameset>.
  class InFramesetPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset

    handle_start 'html', 'frameset', 'frame', 'noframes'

    handle_end 'frameset', 'noframes'

    # Character data is reported as a parse error and otherwise dropped.
    def processCharacters(data)
      message = _('Unexpected characters in the frameset phase. Characters ignored.')
      @parser.parseError(message)
    end

    # Nested <frameset>: inserted as a regular element.
    def startTagFrameset(name, attributes)
      @tree.insertElement(name, attributes)
    end

    # <frame>: inserted, then immediately popped off the open-element stack.
    def startTagFrame(name, attributes)
      @tree.insertElement(name, attributes)
      @tree.openElements.pop
    end

    # <noframes> is handed to the :inBody phase.
    def startTagNoframes(name, attributes)
      in_body = @parser.phases[:inBody]
      in_body.processStartTag(name, attributes)
    end

    # Any other start tag is reported and ignored.
    def startTagOther(name, attributes)
      message = _("Unexpected start tag token (#{name}) in the frameset phase. Ignored")
      @parser.parseError(message)
    end

    # </frameset>: pops the current node unless it is <html>, then leaves this
    # phase once the current node is no longer a frameset.
    def endTagFrameset(name)
      if @tree.openElements[-1].name != 'html'
        @tree.openElements.pop
      else
        # innerHTML case
        @parser.parseError(_("Unexpected end tag token (frameset) in the frameset phase (innerHTML)."))
      end

      # Switch only when we are not in innerHTML mode and the current node is
      # not a "frameset" element (anymore).
      unless @parser.innerHTML || @tree.openElements[-1].name == 'frameset'
        @parser.phase = @parser.phases[:afterFrameset]
      end
    end

    # </noframes> is handed to the :inBody phase.
    def endTagNoframes(name)
      in_body = @parser.phases[:inBody]
      in_body.processEndTag(name)
    end

    # Any other end tag is reported and ignored.
    def endTagOther(name)
      message = _("Unexpected end tag token (#{name}) in the frameset phase. Ignored.")
      @parser.parseError(message)
    end
  end
end

View file

@ -1,126 +0,0 @@
require 'html5lib/html5parser/phase'
module HTML5lib
# Parser phase that handles tokens while the <head> is being built.
class InHeadPhase < Phase
# handle_start / handle_end come from Phase (not visible here). Presumably
# they map tag names onto the startTagXxx / endTagXxx methods below, with the
# hash form naming the handler suffix explicitly - TODO confirm.
handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta )
handle_end 'head'
handle_end %w( html body br ) => 'ImplyAfterHead'
handle_end %w( title style script )
# End of input: an unclosed title/style/script is reported and popped, then
# the head is left via anythingElse and EOF is replayed on the new phase.
def processEOF
if ['title', 'style', 'script'].include?(name = @tree.openElements[-1].name)
@parser.parseError(_("Unexpected end of file. Expected end tag (#{name})."))
@tree.openElements.pop
end
anythingElse
@parser.phase.processEOF
end
# Text inside title/style/script is inserted; any other text leaves the head
# and is replayed on the phase selected by anythingElse.
def processCharacters(data)
if ['title', 'style', 'script'].include?(@tree.openElements[-1].name)
@tree.insertText(data)
else
anythingElse
@parser.phase.processCharacters(data)
end
end
# A second <head> is a parse error and is ignored.
def startTagHead(name, attributes)
@parser.parseError(_('Unexpected start tag head in existing head. Ignored'))
end
# <title>: appended to the head, pushed as the current node, and the
# tokenizer is switched to the :RCDATA content model.
def startTagTitle(name, attributes)
element = @tree.createElement(name, attributes)
appendToHead(element)
@tree.openElements.push(element)
@parser.tokenizer.contentModelFlag = :RCDATA
end
# <style>: goes into the head when a head pointer exists and this is still
# the active phase, otherwise under the current node; the tokenizer is
# switched to the :CDATA content model.
def startTagStyle(name, attributes)
element = @tree.createElement(name, attributes)
if @tree.headPointer != nil and @parser.phase == @parser.phases[:inHead]
appendToHead(element)
else
@tree.openElements[-1].appendChild(element)
end
@tree.openElements.push(element)
@parser.tokenizer.contentModelFlag = :CDATA
end
# <script>: same placement rule as <style>, and the element is additionally
# flagged "parser-inserted".
def startTagScript(name, attributes)
#XXX Inner HTML case may be wrong
element = @tree.createElement(name, attributes)
element._flags.push("parser-inserted")
if (@tree.headPointer != nil and
@parser.phase == @parser.phases[:inHead])
appendToHead(element)
else
@tree.openElements[-1].appendChild(element)
end
@tree.openElements.push(element)
@parser.tokenizer.contentModelFlag = :CDATA
end
# <base>, <link>, <meta>: same placement rule as <style>, but the element is
# not pushed onto the open-element stack.
def startTagBaseLinkMeta(name, attributes)
element = @tree.createElement(name, attributes)
if @tree.headPointer != nil and @parser.phase == @parser.phases[:inHead]
appendToHead(element)
else
@tree.openElements[-1].appendChild(element)
end
end
# Any other start tag leaves the head and is replayed on the new phase.
def startTagOther(name, attributes)
anythingElse
@parser.phase.processStartTag(name, attributes)
end
# </head>: pops <head> when it is the current node, otherwise reports an
# error. The phase moves to :afterHead in both cases.
def endTagHead(name)
if @tree.openElements[-1].name == 'head'
@tree.openElements.pop
else
@parser.parseError(_("Unexpected end tag (head). Ignored."))
end
@parser.phase = @parser.phases[:afterHead]
end
# </html>, </body>, </br>: leave the head and replay the end tag.
def endTagImplyAfterHead(name)
anythingElse
@parser.phase.processEndTag(name)
end
# </title>, </style>, </script>: popped only when it matches the current node.
def endTagTitleStyleScript(name)
if @tree.openElements[-1].name == name
@tree.openElements.pop
else
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
end
# Any other end tag is reported and ignored.
def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
# Leaves the head: closes <head> when it is the current node, otherwise just
# switches the parser to :afterHead.
def anythingElse
if @tree.openElements[-1].name == 'head'
endTagHead('head')
else
@parser.phase = @parser.phases[:afterHead]
end
end
protected
# Appends +element+ to the head element, or to the current node when no head
# pointer exists (asserted to happen only in innerHTML mode).
# NOTE(review): `assert` is not defined in this block - confirm where it
# comes from.
def appendToHead(element)
if @tree.headPointer.nil?
assert @parser.innerHTML
@tree.openElements[-1].appendChild(element)
else
@tree.headPointer.appendChild(element)
end
end
end
end

View file

@ -1,84 +0,0 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase that handles tokens while a <select> element is open.
  class InSelectPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select

    handle_start 'html', 'option', 'optgroup', 'select'

    handle_end 'option', 'optgroup', 'select', %w( caption table tbody tfoot thead tr td th ) => 'TableElements'

    # Character data is inserted as text at the current position.
    def processCharacters(data)
      @tree.insertText(data)
    end

    def startTagOption(name, attributes)
      # We need to imply </option> if <option> is the current node.
      @tree.openElements.pop if @tree.openElements[-1].name == 'option'
      @tree.insertElement(name, attributes)
    end

    # <optgroup> implies closing an open <option> and then an open <optgroup>.
    def startTagOptgroup(name, attributes)
      @tree.openElements.pop if @tree.openElements[-1].name == 'option'
      @tree.openElements.pop if @tree.openElements[-1].name == 'optgroup'
      @tree.insertElement(name, attributes)
    end

    # A nested <select> is reported and treated like </select>.
    def startTagSelect(name, attributes)
      @parser.parseError(_('Unexpected start tag (select) in the select phase implies select start tag.'))
      endTagSelect('select')
    end

    # Any other start tag is reported and ignored.
    def startTagOther(name, attributes)
      # The message must be double-quoted: in a single-quoted string the
      # text "#{name}" is emitted literally instead of the tag name.
      @parser.parseError(_("Unexpected start tag token (#{name}) in the select phase. Ignored."))
    end

    # </option>: popped only when <option> is the current node.
    def endTagOption(name)
      if @tree.openElements[-1].name == 'option'
        @tree.openElements.pop
      else
        @parser.parseError(_('Unexpected end tag (option) in the select phase. Ignored.'))
      end
    end

    def endTagOptgroup(name)
      # </optgroup> implicitly closes <option>
      if @tree.openElements[-1].name == 'option' and @tree.openElements[-2].name == 'optgroup'
        @tree.openElements.pop
      end
      # It also closes </optgroup>
      if @tree.openElements[-1].name == 'optgroup'
        @tree.openElements.pop
      # But nothing else
      else
        @parser.parseError(_('Unexpected end tag (optgroup) in the select phase. Ignored.'))
      end
    end

    # </select>: pops elements up to the <select> and resets the insertion
    # mode; when no <select> is in scope the tag is only reported.
    def endTagSelect(name)
      if in_scope?('select', true)
        remove_open_elements_until('select')
        @parser.resetInsertionMode
      else
        # innerHTML case
        @parser.parseError
      end
    end

    # A table-related end tag is always reported; when the element is in
    # scope, the <select> is closed first and the tag is replayed on the
    # resulting phase.
    def endTagTableElements(name)
      @parser.parseError(_("Unexpected table end tag (#{name}) in the select phase."))
      if in_scope?(name, true)
        endTagSelect('select')
        @parser.phase.processEndTag(name)
      end
    end

    # Any other end tag is reported and ignored.
    def endTagOther(name)
      @parser.parseError(_("Unexpected end tag token (#{name}) in the select phase. Ignored."))
    end
  end
end

View file

@ -1,36 +0,0 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Parser phase for tokens that arrive where end of file was expected.
  # Anything other than EOF, comments and whitespace is reported and then
  # replayed on @parser.lastPhase.
  class TrailingEndPhase < Phase

    # End of file is what this phase expects: nothing to do.
    def processEOF
    end

    # Comments are attached to the document itself.
    def processComment(data)
      @tree.insertComment(data, @tree.document)
    end

    # Whitespace is delegated to the previous phase without switching to it.
    def processSpaceCharacters(data)
      @parser.lastPhase.processSpaceCharacters(data)
    end

    def processCharacters(data)
      @parser.parseError(_('Unexpected non-space characters. Expected end of file.'))
      @parser.phase = @parser.lastPhase
      @parser.phase.processCharacters(data)
    end

    def processStartTag(name, attributes)
      # Double-quoted so that #{name} is interpolated; the single-quoted
      # original emitted the literal text "#{name}".
      @parser.parseError(_("Unexpected start tag (#{name}). Expected end of file."))
      @parser.phase = @parser.lastPhase
      @parser.phase.processStartTag(name, attributes)
    end

    def processEndTag(name)
      # Double-quoted for the same reason as in processStartTag.
      @parser.parseError(_("Unexpected end tag (#{name}). Expected end of file."))
      @parser.phase = @parser.lastPhase
      @parser.phase.processEndTag(name)
    end
  end
end

View file

@ -1,2 +0,0 @@
require 'html5lib/serializer/htmlserializer'
require 'html5lib/serializer/xhtmlserializer'

View file

@ -1,19 +0,0 @@
require 'html5lib/serializer/htmlserializer'
module HTML5lib
  # HTMLSerializer subclass that supplies its own default options.
  class XHTMLSerializer < HTMLSerializer
    # Option values used unless the caller overrides them.
    DEFAULTS = {
      :quote_attr_values           => true,
      :minimize_boolean_attributes => false,
      :use_trailing_solidus        => true,
      :escape_lt_in_attrs          => true,
      :omit_optional_tags          => false
    }

    # options:: overrides merged on top of DEFAULTS; DEFAULTS itself is left
    #           unmodified.
    def initialize(options={})
      settings = DEFAULTS.merge(options)
      super(settings)
    end
  end
end

File diff suppressed because it is too large Load diff

View file

@ -1,26 +0,0 @@
require 'html5lib/treewalkers/base'
module HTML5lib
  module TreeWalkers

    class << self
      # Looks up a tree walker class by name (case-insensitive; anything that
      # responds to to_s is accepted). The implementation file is required
      # lazily, only when that walker is requested.
      #
      # Raises a RuntimeError for a name that is not known.
      def [](name)
        walker = name.to_s.downcase

        if walker == 'simpletree'
          require 'html5lib/treewalkers/simpletree'
          return SimpleTree::TreeWalker
        end

        if walker == 'rexml'
          require 'html5lib/treewalkers/rexml'
          return REXML::TreeWalker
        end

        if walker == 'hpricot'
          require 'html5lib/treewalkers/hpricot'
          return Hpricot::TreeWalker
        end

        raise "Unknown TreeWalker #{name}"
      end

      alias_method :getTreeWalker, :[]
    end
  end
end

View file

@ -1,156 +0,0 @@
require 'html5lib/constants'
module HTML5lib
module TreeWalkers
# Helpers that build the token hashes emitted by tree walkers. Every token
# is a Hash with a :type key; the other keys depend on the token type.
module TokenConstructor
  # Token describing a serialization problem.
  def error(msg)
    {:type => "SerializeError", :data => msg}
  end

  # Converts an attribute collection to an array via to_a.
  def normalizeAttrs(attrs)
    attrs.to_a
  end

  # Token for a void element.
  # NOTE(review): when hasChildren is true an error token is built but its
  # value is discarded; only the :EmptyTag token is returned.
  def emptyTag(name, attrs, hasChildren=false)
    if hasChildren
      error(_("Void element has children"))
    end
    token = {:type => :EmptyTag, :name => name}
    token[:data] = normalizeAttrs(attrs)
    token
  end

  # Token opening the element +name+ with the given attributes.
  def startTag(name, attrs)
    token = {:type => :StartTag, :name => name}
    token[:data] = normalizeAttrs(attrs)
    token
  end

  # Token closing the element +name+.
  def endTag(name)
    {:type => :EndTag, :name => name, :data => []}
  end

  # Yields up to three tokens for +data+: leading whitespace
  # (:SpaceCharacters), the remaining text (:Characters) and trailing
  # whitespace (:SpaceCharacters). Input that is whitespace only yields a
  # single :SpaceCharacters token.
  def text(data)
    space_run = "[#{SPACE_CHARACTERS.join('')}]+"

    if data =~ /\A(#{space_run})/m
      leading = Regexp.last_match(1)
      yield({:type => :SpaceCharacters, :data => leading})
      data = data[leading.length .. -1]
      return if data.empty?
    end

    if data =~ /(#{space_run})\Z/m
      trailing = Regexp.last_match(1)
      yield({:type => :Characters, :data => data[0 ... -trailing.length]})
      yield({:type => :SpaceCharacters, :data => trailing})
    else
      yield({:type => :Characters, :data => data})
    end
  end

  # Token for a comment node.
  def comment(data)
    {:type => :Comment, :data => data}
  end

  # Token for a doctype; :data is true when the name is "html" in any case.
  def doctype(name)
    is_html = (name.upcase == "HTML")
    {:type => :Doctype, :name => name, :data => is_html}
  end

  # Error token for a node kind the walker does not know.
  def unknown(nodeType)
    error(_("Unknown node type: ") + nodeType.to_s)
  end

  # Identity hook applied to message strings; returns +str+ unchanged.
  def _(str)
    str
  end
end
# Common superclass for tree walkers: stores the tree to be walked and
# mixes in the token-building helpers from TokenConstructor.
class Base
  include TokenConstructor

  # tree:: root object of the tree this walker will traverse.
  def initialize(tree)
    @tree = tree
  end

  # Subclasses must override this to yield tokens for @tree.
  def each
    raise NotImplementedError
  end

  # +walk+ is bound to the +each+ defined in this class.
  alias_method :walk, :each
end
# Tree walker that traverses the tree with explicit loops instead of
# recursion and yields one token per visited node (plus end tags).
# Subclasses supply the four navigation hooks below, each of which raises
# NotImplementedError here.
class NonRecursiveTreeWalker < TreeWalkers::Base
# Returns an array describing +node+. As consumed by #each, the first element
# is the node kind (:DOCTYPE, :TEXT, :ELEMENT, :COMMENT, :DOCUMENT,
# :DOCUMENT_FRAGMENT, nil, or anything else for "unknown") and the remaining
# elements are the kind-specific details.
def node_details(node)
raise NotImplementedError
end
# Returns the first child of +node+, or nil when there is none.
def first_child(node)
raise NotImplementedError
end
# Returns the sibling following +node+, or nil when there is none.
def next_sibling(node)
raise NotImplementedError
end
# Returns the parent of +node+.
def parent(node)
raise NotImplementedError
end
# Yields the tokens for @tree in document order: each node is emitted on the
# way down, and element end tags are emitted on the way back up.
def each
currentNode = @tree
while currentNode != nil
details = node_details(currentNode)
hasChildren = false
# shift removes the kind, leaving only the kind-specific details.
case details.shift
when :DOCTYPE
yield doctype(*details)
when :TEXT
# text may produce several tokens, so it yields them itself.
text(*details) {|token| yield token}
when :ELEMENT
name, attributes, hasChildren = details
if VOID_ELEMENTS.include?(name)
yield emptyTag(name, attributes.to_a, hasChildren)
# Void elements are never descended into, whatever the node reported.
hasChildren = false
else
yield startTag(name, attributes.to_a)
end
when :COMMENT
yield comment(details[0])
when :DOCUMENT, :DOCUMENT_FRAGMENT
# Containers produce no token of their own; only their children do.
hasChildren = true
when nil
# ignore (REXML::XMLDecl is an example)
else
yield unknown(details[0])
end
firstChild = hasChildren ? first_child(currentNode) : nil
if firstChild != nil
currentNode = firstChild
else
# Nothing to descend into: climb, closing elements, until a next sibling
# is found or the root has been closed.
while currentNode != nil
details = node_details(currentNode)
if details.shift == :ELEMENT
name, attributes, hasChildren = details
yield endTag(name) if !VOID_ELEMENTS.include?(name)
end
if @tree == currentNode
# Back at the root: setting nil terminates both loops.
currentNode = nil
else
nextSibling = next_sibling(currentNode)
if nextSibling != nil
currentNode = nextSibling
break
end
currentNode = parent(currentNode)
end
end
end
end
end
end
end
end

View file

@ -26,15 +26,15 @@ def parse(opts, args)
exit(1) exit(1)
end end
require 'html5lib/treebuilders' require 'html5/treebuilders'
treebuilder = HTML5lib::TreeBuilders[opts.treebuilder] treebuilder = HTML5::TreeBuilders[opts.treebuilder]
if opts.output == :xml if opts.output == :xml
require 'html5lib/liberalxmlparser' require 'html5/liberalxmlparser'
p = HTML5lib::XHTMLParser.new(:tree=>treebuilder) p = HTML5::XHTMLParser.new(:tree=>treebuilder)
else else
require 'html5lib/html5parser' require 'html5/html5parser'
p = HTML5lib::HTMLParser.new(:tree=>treebuilder) p = HTML5::HTMLParser.new(:tree=>treebuilder)
end end
if opts.parsemethod == :parse if opts.parsemethod == :parse
@ -70,10 +70,10 @@ def printOutput(parser, document, opts)
when :xml when :xml
print document print document
when :html when :html
require 'html5lib/treewalkers' require 'html5/treewalkers'
tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document) tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
require 'html5lib/serializer' require 'html5/serializer'
puts HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer) puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
when :hilite when :hilite
print document.hilite print document.hilite
when :tree when :tree
@ -188,6 +188,10 @@ opts = OptionParser.new do |opts|
options.serializer[:escape_lt_in_attrs] = lt options.serializer[:escape_lt_in_attrs] = lt
end end
opts.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata|
options.serializer[:escape_rcdata] = rcdata
end
opts.separator "" opts.separator ""
opts.separator "Other Options:" opts.separator "Other Options:"

View file

@ -33,7 +33,6 @@ EUC-jp
#encoding #encoding
EUC-jp EUC-jp
#data #data
<!-- --> <!-- -->
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> <meta http-equiv="Content-Type" content="text/html; charset=utf-8">

View file

@ -92,7 +92,8 @@
{"description": "rcdata", {"description": "rcdata",
"input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]], "input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]],
"expected": ["<script>a<b>c&d"] "expected": ["<script>a<b>c&d"],
"xhtml": ["<script>a&lt;b&gt;c&amp;d"]
}, },
{"description": "doctype", {"description": "doctype",

View file

@ -49,6 +49,12 @@
"options": {"escape_lt_in_attrs": true}, "options": {"escape_lt_in_attrs": true},
"input": [["StartTag", "a", {"title": "a<b>c&d"}]], "input": [["StartTag", "a", {"title": "a<b>c&d"}]],
"expected": ["<a title=\"a&lt;b>c&amp;d\">"] "expected": ["<a title=\"a&lt;b>c&amp;d\">"]
},
{"description": "rcdata",
"options": {"escape_rcdata": true},
"input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]],
"expected": ["<script>a&lt;b&gt;c&amp;d"]
} }
]} ]}

View file

@ -3,13 +3,13 @@
{"description": "bare text with leading spaces", {"description": "bare text with leading spaces",
"options": {"strip_whitespace": true}, "options": {"strip_whitespace": true},
"input": [["Characters", "\t\r\n\u000B\u000C foo"]], "input": [["Characters", "\t\r\n\u000B\u000C foo"]],
"expected": ["foo"] "expected": [" foo"]
}, },
{"description": "bare text with trailing spaces", {"description": "bare text with trailing spaces",
"options": {"strip_whitespace": true}, "options": {"strip_whitespace": true},
"input": [["Characters", "foo \t\r\n\u000B\u000C"]], "input": [["Characters", "foo \t\r\n\u000B\u000C"]],
"expected": ["foo"] "expected": ["foo "]
}, },
{"description": "bare text with inner spaces", {"description": "bare text with inner spaces",

View file

@ -0,0 +1,43 @@
[
{"type": "text/html", "input": ""},
{"type": "text/html", "input": "<!---->"},
{"type": "text/html", "input": "<!--asdfaslkjdf;laksjdf as;dkfjsd-->"},
{"type": "text/html", "input": "<!"},
{"type": "text/html", "input": "\t"},
{"type": "text/html", "input": "<!>"},
{"type": "text/html", "input": "<?"},
{"type": "text/html", "input": "<??>"},
{"type": "application/rss+xml", "input": "<rss"},
{"type": "application/atom+xml", "input": "<feed"},
{"type": "text/html", "input": "<html"},
{"type": "text/html", "input": "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n<html><head>\n<title>302 Found</title>\n</head><body>\n<h1>Found</h1>\n<p>The document has moved <a href=\"http://feeds.feedburner.com/gofug\">here</a>.</p>\n</body></html>\n"},
{"type": "text/html", "input": "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">\r\n<HTML><HEAD>\r\n <link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/289619328/feed.css\" /><link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/431602649/feed.css\" />\r\n<link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/382549546/feed.css\" />\r\n<link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/314618017/feed.css\" /><META http-equiv=\"expires\" content="},
{"type": "text/html", "input": "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\r\n<html>\r\n<head>\r\n<title>Xiaxue - Chicken pie blogger.</title><meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\"><style type=\"text/css\">\r\n<style type=\"text/css\">\r\n<!--\r\nbody {\r\n background-color: #FFF2F2;\r\n}\r\n.style1 {font-family: Georgia, \"Times New Roman\", Times, serif}\r\n.style2 {\r\n color: #8a567c;\r\n font-size: 14px;\r\n font-family: Georgia, \"Times New Roman\", Times, serif;\r\n}\r"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<head> \r\n<title>Google Operating System</title>\r\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"Description\" content=\"Unofficial news and tips about Google. A blog that watches Google's latest developments and the attempts to move your operating system online.\" />\r\n<meta name=\"generator\" c"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<head>\r\n <title>Assimilated Press</title> <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"MSSmartTagsPreventParsing\" content=\"true\" />\r\n<meta name=\"generator\" content=\"Blogger\" />\r\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"Assimilated Press - Atom\" href=\"http://assimila"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<head>\r\n <title>PostSecret</title>\r\n<META name=\"keywords\" Content=\"secrets, postcard, secret, postcards, postsecret, postsecrets,online confessional, post secret, post secrets, artomatic, post a secret\"><META name=\"discription\" Content=\"See a Secret...Share a Secret\"> <meta http-equiv=\"Content-Type\" content=\"te"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns='http://www.w3.org/1999/xhtml' xmlns:b='http://www.google.com/2005/gml/b' xmlns:data='http://www.google.com/2005/gml/data' xmlns:expr='http://www.google.com/2005/gml/expr'>\n <head>\n \n <meta content='text/html; charset=UTF-8' http-equiv='Content-Type'/>\n <meta content='true' name='MSSmartTagsPreventParsing'/>\n <meta content='blogger' name='generator'/>\n <link rel=\"alternate\" typ"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"ja\">\n<head profile=\"http://gmpg.org/xfn/11\"> \n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" /> \n<title> CMS Lever</title><link rel=\"stylesheet\" type=\"text/css\" media=\"screen\" href=\"http://s.wordpress.com/wp-content/themes/pub/twenty-eight/2813.css\"/>\n<link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS 2.0\" h"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"en\"><head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\n<title> Park Avenue Peerage</title>\t<meta name=\"generator\" content=\"WordPress.com\" />\t<!-- feeds -->\n\t<link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS 2.0\" href=\"http://parkavenuepeerage.wordpress.com/feed/\" />\t<link rel=\"pingback\" href="},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"ja\"><head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\n<title> \u884c\u96f2\u6d41\u6c34 -like a floating clouds and running water-</title>\t<meta name=\"generator\" content=\"WordPress.com\" />\t<!-- feeds -->\n\t<link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS 2.0\" href=\"http://shw4.wordpress.com/feed/\" />\t<li"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n<title>Go Fug Yourself</title><link rel=\"stylesheet\" href=\"http://gofugyourself.typepad.com/go_fug_yourself/styles.css\" type=\"text/css\" />\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"Atom\" "},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"en\"><head profile=\"http://gmpg.org/xfn/11\">\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" /><title> Ladies&#8230;</title><meta name=\"generator\" content=\"WordPress.com\" /> <!-- leave this for stats --><link rel=\"stylesheet\" href=\"http://s.wordpress.com/wp-content/themes/default/style.css?1\" type=\"tex"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\r\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\r\n<head>\r\n <title>The Sartorialist</title> <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"MSSmartTagsPreventParsing\" content=\"true\" />\r\n<meta name=\"generator\" content=\"Blogger\" />\r\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"The Sartorialist - Atom\" href=\"http://thesartorialist.blogspot"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\">\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\" />\n<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n<title>Creating Passionate Users</title><link rel=\"stylesheet\" href=\"http://headrush.typepad.com/creating_passionate_users/styles.css\" type=\"text/css\" />\n<link rel=\"alternate\" type"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" id=\"sixapart-standard\">\n<head>\n\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n\t<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n\t\n\t\n <meta name=\"keywords\" content=\"marketing, blog, seth, ideas, respect, permission\" />\n <meta name=\"description\" content=\"Seth Godin's riffs on marketing, respect, and the "},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" id=\"sixapart-standard\">\n<head>\n\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n\t<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n\t\n\t\n \n <meta name=\"description\" content=\" Western Civilization hangs in the balance. This blog is part of the solution,the cure. Get your heads out of the sand and Fight the G"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"en\">\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=pahrefhttpwwwfeedburnercomtarget_blankimgsrchttpwwwfeedburnercomfbimagespubpowered_by_fbgifaltPoweredbyFeedBurnerstyleborder0ap\" />\n<title> From Under the Rotunda</title>\n<link rel=\"stylesheet\" href=\"http://s.wordpress.com/wp-content/themes/pub/andreas04/style.css\" type=\"text/css\""},
{"type": "application/atom+xml", "input": "<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href=\"http://www.blogger.com/styles/atom.css\" type=\"text/css\"?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/'><id>tag:blogger.com,1999:blog-10861780</id><updated>2007-07-27T12:38:50.888-07:00</updated><title type='text'>Official Google Blog</title><link rel='alternate' type='text/html' href='http://googleblog.blogspot.com/'/><link rel='next' type='application/atom+xml' href='http://googleblog.blogs"},
{"type": "application/rss+xml", "input": "<?xml version='1.0' encoding='UTF-8'?><rss xmlns:atom='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' version='2.0'><channel><atom:id>tag:blogger.com,1999:blog-10861780</atom:id><lastBuildDate>Fri, 27 Jul 2007 19:38:50 +0000</lastBuildDate><title>Official Google Blog</title><description/><link>http://googleblog.blogspot.com/</link><managingEditor>Eric Case</managingEditor><generator>Blogger</generator><openSearch:totalResults>729</openSearch:totalResults><openSearc"},
{"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"pahrefhttpwwwfeedburnercomtarget_blankimgsrchttpwwwfeedburnercomfbimagespubpowered_by_fbgifaltPoweredbyFeedBurnerstyleborder0ap\"?>\n<!-- generator=\"wordpress/MU\" -->\n<rss version=\"2.0\"\n\txmlns:content=\"http://purl.org/rss/1.0/modules/content/\"\n\txmlns:wfw=\"http://wellformedweb.org/CommentAPI/\"\n\txmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n\t><channel>\n\t<title>From Under the Rotunda</title>\n\t<link>http://dannybernardi.wordpress.com</link>\n\t<description>The Monographs of Danny Ber"},
{"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!-- generator=\"wordpress/MU\" -->\n<rss version=\"2.0\"\n\txmlns:content=\"http://purl.org/rss/1.0/modules/content/\"\n\txmlns:wfw=\"http://wellformedweb.org/CommentAPI/\"\n\txmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n\t><channel>\n\t<title>CMS Lever</title>\n\t<link>http://kanaguri.wordpress.com</link>\n\t<description>CMS\u306e\u6c17\u306b\u306a\u3063\u305f\u3053\u3068</description>\n\t<pubDate>Wed, 18 Jul 2007 21:26:22 +0000</pubDate>\n\t<generator>http://wordpress.org/?v=MU</generator>\n\t<language>ja</languag"},
{"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:thr=\"http://purl.org/syndication/thread/1.0\">\n <title>Atlas Shrugs</title>\n <link rel=\"self\" type=\"application/atom+xml\" href=\"http://atlasshrugs2000.typepad.com/atlas_shrugs/atom.xml\" />\n <link rel=\"alternate\" type=\"text/html\" href=\"http://atlasshrugs2000.typepad.com/atlas_shrugs/\" />\n <id>tag:typepad.com,2003:weblog-132946</id>\n <updated>2007-08-15T16:07:34-04"},
{"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atom10full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:thr=\"http://purl.org/syndication/thread/1.0\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\">\r\n <title>Creating Passionate Users</title>\r\n "},
{"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atom10full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\">\r\n <title>Seth's Blog</title>\r\n <link rel=\"alternate\" type=\"text/html\" href=\"http://sethgodin.typepad.com/seths_blog/\" />\r\n <link rel=\"s"},
{"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atom10full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:openSearch=\"http://a9.com/-/spec/opensearchrss/1.0/\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\"><id>tag:blogger.com,1999:blog-32454861</id><updated>2007-07-31T21:44:09.867+02:00</upd"},
{"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atomfull.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://purl.org/atom/ns#\" xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\" version=\"0.3\">\r\n <title>Go Fug Yourself</title>\r\n <link rel=\"alternate\" type=\"text/html\" href=\"http://go"},
{"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/rss2full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><rss xmlns:creativeCommons=\"http://backend.userland.com/creativeCommonsRssModule\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\" version=\"2.0\"><channel><title>Google Operating System</title><link>http://googlesystem.blogspot.com/</link>"},
{"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"\"?>\n<!-- generator=\"wordpress/MU\" -->\n<rss version=\"2.0\"\n\txmlns:content=\"http://purl.org/rss/1.0/modules/content/\"\n\txmlns:wfw=\"http://wellformedweb.org/CommentAPI/\"\n\txmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n\t><channel>\n\t<title>Nunublog</title>\n\t<link>http://nunubh.wordpress.com</link>\n\t<description>Just Newbie Blog!</description>\n\t<pubDate>Mon, 09 Jul 2007 18:54:09 +0000</pubDate>\n\t<generator>http://wordpress.org/?v=MU</generator>\n\t<language>id</language>\n\t\t\t<item>\n\t\t<ti"},
{"type": "text/html", "input": "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<HEAD>\r\n<TITLE>Design*Sponge</TITLE><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"MSSmartTagsPreventParsing\" content=\"true\" />\r\n<meta name=\"generator\" content=\"Blogger\" />\r\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"Design*Sponge - Atom\" href=\"http://designsponge.blogspot.com/feeds/posts/default\" />\r\n<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Design*Sponge - RSS\" href="},
{"type": "text/html", "input": "<HTML>\n<HEAD>\n<TITLE>Moved Temporarily</TITLE>\n</HEAD>\n<BODY BGCOLOR=\"#FFFFFF\" TEXT=\"#000000\">\n<H1>Moved Temporarily</H1>\nThe document has moved <A HREF=\"http://feeds.feedburner.com/thesecretdiaryofstevejobs\">here</A>.\n</BODY>\n</HTML>\n"}
]

View file

@ -11,12 +11,24 @@
"input":"foo</bar>", "input":"foo</bar>",
"output":[["Character", "foo"], ["EndTag", "bar"]]}, "output":[["Character", "foo"], ["EndTag", "bar"]]},
{"description":"End tag closing RCDATA or CDATA (case-insensitivity)",
"contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"bar",
"input":"foo</bAr>",
"output":[["Character", "foo"], ["EndTag", "bar"]]},
{"description":"End tag with incorrect name in RCDATA or CDATA", {"description":"End tag with incorrect name in RCDATA or CDATA",
"contentModelFlags":["RCDATA", "CDATA"], "contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"baz", "lastStartTag":"baz",
"input":"</foo>bar</baz>", "input":"</foo>bar</baz>",
"output":[["Character", "</foo>bar"], ["EndTag", "baz"]]}, "output":[["Character", "</foo>bar"], ["EndTag", "baz"]]},
{"description":"End tag with incorrect name in RCDATA or CDATA (starting like correct name)",
"contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"baz",
"input":"</foo>bar</bazaar>",
"output":[["Character", "</foo>bar</bazaar>"]]},
{"description":"End tag closing RCDATA or CDATA, switching back to PCDATA", {"description":"End tag closing RCDATA or CDATA, switching back to PCDATA",
"contentModelFlags":["RCDATA", "CDATA"], "contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"bar", "lastStartTag":"bar",

File diff suppressed because it is too large Load diff

View file

@ -135,7 +135,7 @@
{"description":"Entity without trailing semicolon (2)", {"description":"Entity without trailing semicolon (2)",
"input":"I'm &notin", "input":"I'm &notin",
"output":[["Character","I'm "], "ParseError", ["Character", ""]]}, "output":[["Character","I'm "], "ParseError", ["Character", "¬in"]]},
{"description":"Partial entity match at end of file", {"description":"Partial entity match at end of file",
"input":"I'm &no", "input":"I'm &no",
@ -151,6 +151,22 @@
{"description":"Hexadecimal entity in attribute", {"description":"Hexadecimal entity in attribute",
"input":"<h a='&#x3f;'></h>", "input":"<h a='&#x3f;'></h>",
"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]} "output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]},
{"description":"Entity in attribute without semicolon ending in x",
"input":"<h a='&notx'>",
"output":["ParseError", ["StartTag", "h", {"a":"&notx"}]]},
{"description":"Entity in attribute without semicolon ending in 1",
"input":"<h a='&not1'>",
"output":["ParseError", ["StartTag", "h", {"a":"&not1"}]]},
{"description":"Entity in attribute without semicolon ending in i",
"input":"<h a='&noti'>",
"output":["ParseError", ["StartTag", "h", {"a":"&noti"}]]},
{"description":"Entity in attribute without semicolon",
"input":"<h a='&COPY'>",
"output":["ParseError", ["StartTag", "h", {"a":"©"}]]}
]} ]}

View file

@ -42,27 +42,23 @@
{"description":"Numeric entity representing the NUL character", {"description":"Numeric entity representing the NUL character",
"input":"&#0000;", "input":"&#0000;",
"output":[["Character", "\uFFFD"]]}, "output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity representing the NUL character", {"description":"Hexadecimal entity representing the NUL character",
"input":"&#x0000;", "input":"&#x0000;",
"output":[["Character", "\uFFFD"]]}, "output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)", {"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#2225222;", "input":"&#2225222;",
"output":[["Character", "\uFFFD"]]}, "output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)", {"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#x1010FFFF;", "input":"&#x1010FFFF;",
"output":[["Character", "\uFFFD"]]}, "output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Numeric entity representing a Windows-1252 'codepoint'", {"description":"Hexadecimal entity pair representing a surrogate pair",
"input":"&#137;", "input":"&#xD869;&#xDED6;",
"output":["ParseError", ["Character", "\u2030"]]}, "output":["ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity representing a Windows-1252 'codepoint'",
"input":"&#x89;",
"output":["ParseError", ["Character", "\u2030"]]},
{"description":"Hexadecimal entity with mixed uppercase and lowercase", {"description":"Hexadecimal entity with mixed uppercase and lowercase",
"input":"&#xaBcD;", "input":"&#xaBcD;",
@ -118,7 +114,15 @@
{"description":"Null Byte Replacement", {"description":"Null Byte Replacement",
"input":"\u0000", "input":"\u0000",
"output":[["Character", "\ufffd"]]} "output":["ParseError", ["Character", "\ufffd"]]},
{"description":"Comment with dash",
"input":"<!---x",
"output":["ParseError", ["Comment", "-x"]]},
{"description":"Entity + newline",
"input":"\nx\n&gt;\n",
"output":[["Character","\nx\n>\n"]]}
]} ]}

View file

@ -0,0 +1,367 @@
{"tests": [
{"description":"<",
"input":"<",
"output":["ParseError", ["Character", "<"]]},
{"description":"<>",
"input":"<>",
"output":["ParseError", ["Character", "<>"]]},
{"description":"<!",
"input":"<!",
"output":["ParseError", ["Comment", ""]]},
{"description":"<!>",
"input":"<!>",
"output":["ParseError", ["Comment", ""]]},
{"description":"<!--",
"input":"<!--",
"output":["ParseError", ["Comment", ""]]},
{"description":"<!-->",
"input":"<!-->",
"output":["ParseError", ["Comment", ""]]},
{"description":"<!---",
"input":"<!---",
"output":["ParseError", ["Comment", ""]]},
{"description":"<!--->",
"input":"<!--->",
"output":["ParseError", ["Comment", ""]]},
{"description":"<!---->",
"input":"<!---->",
"output":[["Comment", ""]]},
{"description":"<!-----",
"input":"<!-----",
"output":["ParseError", "ParseError", ["Comment", "-"]]},
{"description":"<!----.",
"input":"<!----.",
"output":["ParseError", "ParseError", ["Comment", "--."]]},
{"description":"<!---?",
"input":"<!---?",
"output":["ParseError", ["Comment", "-?"]]},
{"description":"<!--?-",
"input":"<!--?-",
"output":["ParseError", ["Comment", "?"]]},
{"description":"<!--?--",
"input":"<!--?--",
"output":["ParseError", ["Comment", "?"]]},
{"description":"<!--?-.",
"input":"<!--?-.",
"output":["ParseError", ["Comment", "?-."]]},
{"description":"<!--?.",
"input":"<!--?.",
"output":["ParseError", ["Comment", "?."]]},
{"description":"<?>",
"input":"<?>",
"output":["ParseError", ["Comment", "?"]]},
{"description":"<??",
"input":"<??",
"output":["ParseError", ["Comment", "??"]]},
{"description":"</",
"input":"</",
"output":["ParseError", ["Character", "</"]]},
{"description":"</>",
"input":"</>",
"output":["ParseError"]},
{"description":"</?",
"input":"</?",
"output":["ParseError", ["Comment", "?"]]},
{"description":">",
"input":">",
"output":[["Character", ">"]]},
{"description":"-",
"input":"-",
"output":[["Character", "-"]]},
{"description":"?",
"input":"?",
"output":[["Character", "?"]]},
{"description":"&",
"input":"&",
"output":[["Character", "&"]]},
{"description":"&#",
"input":"&#",
"output":["ParseError", ["Character", "&#"]]},
{"description":"&#9",
"input":"&#9",
"output":["ParseError", ["Character", "\t"]]},
{"description":"<!doctype >",
"input":"<!doctype >",
"output":["ParseError", ["DOCTYPE", "", null, null, false]]},
{"description":"<!doctype ",
"input":"<!doctype ",
"output":["ParseError", ["DOCTYPE", "", null, null, false]]},
{"description":"<!doctype!>",
"input":"<!doctype!>",
"output":["ParseError", ["DOCTYPE", "!", null, null, true]]},
{"description":"<!doctype! >",
"input":"<!doctype! >",
"output":["ParseError", ["DOCTYPE", "!", null, null, true]]},
{"description":"<!doctype! ",
"input":"<!doctype! ",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! ?>",
"input":"<!doctype! ?>",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! ??",
"input":"<!doctype! ??",
"output":["ParseError", "ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype!?",
"input":"<!doctype!?",
"output":["ParseError", "ParseError", ["DOCTYPE", "!?", null, null, false]]},
{"description":"<!doctype! public>",
"input":"<!doctype! public>",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! public ",
"input":"<!doctype! public ",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! public?",
"input":"<!doctype! public?",
"output":["ParseError", "ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! public''",
"input":"<!doctype! public''",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", "", null, false]]},
{"description":"<!doctype! public'(",
"input":"<!doctype! public'(",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", "(", null, false]]},
{"description":"<!doctype! public\"\">",
"input":"<!doctype! public\"\">",
"output":["ParseError", ["DOCTYPE", "!", "", null, true]]},
{"description":"<!doctype! public\"\" ",
"input":"<!doctype! public\"\" ",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", "", null, false]]},
{"description":"<!doctype! public\"\"?",
"input":"<!doctype! public\"\"?",
"output":["ParseError", "ParseError", "ParseError", ["DOCTYPE", "!", "", null, false]]},
{"description":"<!doctype! public\"\"'",
"input":"<!doctype! public\"\"'",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", "", "", false]]},
{"description":"<!doctype! public\"\"\"",
"input":"<!doctype! public\"\"\"",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", "", "", false]]},
{"description":"<!doctype! public\"#",
"input":"<!doctype! public\"#",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", "#", null, false]]},
{"description":"<!doctype! system>",
"input":"<!doctype! system>",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! system ",
"input":"<!doctype! system ",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! system?",
"input":"<!doctype! system?",
"output":["ParseError", "ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! system''",
"input":"<!doctype! system''",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, "", false]]},
{"description":"<!doctype! system'(",
"input":"<!doctype! system'(",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, "(", false]]},
{"description":"<!doctype! system\"\">",
"input":"<!doctype! system\"\">",
"output":["ParseError", ["DOCTYPE", "!", null, "", true]]},
{"description":"<!doctype! system\"\" ",
"input":"<!doctype! system\"\" ",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, "", false]]},
{"description":"<!doctype! system\"\"?",
"input":"<!doctype! system\"\"?",
"output":["ParseError", "ParseError", "ParseError", ["DOCTYPE", "!", null, "", false]]},
{"description":"<!doctype! system\"#",
"input":"<!doctype! system\"#",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, "#", false]]},
{"description":"</z",
"input":"</z",
"output":["ParseError", ["EndTag", "z"]]},
{"description":"<z>",
"input":"<z>",
"output":[["StartTag", "z", {}]]},
{"description":"<z ",
"input":"<z ",
"output":["ParseError", ["StartTag", "z", {}]]},
{"description":"<z/>",
"input":"<z/>",
"output":["ParseError", ["StartTag", "z", {}]]},
{"description":"<z/ ",
"input":"<z/ ",
"output":["ParseError", "ParseError", ["StartTag", "z", {}]]},
{"description":"<z//",
"input":"<z//",
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {}]]},
{"description":"<z",
"input":"<z",
"output":["ParseError", ["StartTag", "z", {}]]},
{"description":"</z",
"input":"</z",
"output":["ParseError", ["EndTag", "z"]]},
{"description":"<z0",
"input":"<z0",
"output":["ParseError", ["StartTag", "z0", {}]]},
{"description":"<z/0=>",
"input":"<z/0=>",
"output":["ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0= ",
"input":"<z/0= ",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0=?>",
"input":"<z/0=?>",
"output":["ParseError", ["StartTag", "z", {"0": "?"}]]},
{"description":"<z/0=? ",
"input":"<z/0=? ",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "?"}]]},
{"description":"<z/0=??",
"input":"<z/0=??",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "??"}]]},
{"description":"<z/0=''",
"input":"<z/0=''",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0='&",
"input":"<z/0='&",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "&"}]]},
{"description":"<z/0='%",
"input":"<z/0='%",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "%"}]]},
{"description":"<z/0=\"'",
"input":"<z/0=\"'",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "'"}]]},
{"description":"<z/0=\"\"",
"input":"<z/0=\"\"",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0=\"&",
"input":"<z/0=\"&",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "&"}]]},
{"description":"<z/0=&",
"input":"<z/0=&",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "&"}]]},
{"description":"<z/0>",
"input":"<z/0>",
"output":["ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0 =",
"input":"<z/0 =",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0 >",
"input":"<z/0 >",
"output":["ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0 ",
"input":"<z/0 ",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0 /",
"input":"<z/0 /",
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0/",
"input":"<z/0/",
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/00",
"input":"<z/00",
"output":["ParseError", "ParseError", ["StartTag", "z", {"00": ""}]]},
{"description":"<z/0 0",
"input":"<z/0 0",
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0='&#9",
"input":"<z/0='&#9",
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"0": "\t"}]]},
{"description":"<z/0=\"&#9",
"input":"<z/0=\"&#9",
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"0": "\t"}]]},
{"description":"<z/0=&#9",
"input":"<z/0=&#9",
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"0": "\t"}]]},
{"description":"<z/0z",
"input":"<z/0z",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0z": ""}]]},
{"description":"<z/0 z",
"input":"<z/0 z",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "", "z": ""}]]},
{"description":"<zz",
"input":"<zz",
"output":["ParseError", ["StartTag", "zz", {}]]},
{"description":"<z/z",
"input":"<z/z",
"output":["ParseError", "ParseError", ["StartTag", "z", {"z": ""}]]}
]}

View file

@ -0,0 +1,198 @@
{"tests": [
{"description":"< in attribute name",
"input":"<z/0 <",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "", "<": ""}]]},
{"description":"< in attribute value",
"input":"<z x=<",
"output":["ParseError", ["StartTag", "z", {"x": "<"}]]},
{"description":"CR EOF after doctype name",
"input":"<!doctype html \r",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"CR EOF in tag name",
"input":"<z\r",
"output":["ParseError", ["StartTag", "z", {}]]},
{"description":"Zero hex numeric entity",
"input":"&#x0",
"output":["ParseError", "ParseError", ["Character", "\uFFFD"]]},
{"description":"Zero decimal numeric entity",
"input":"&#0",
"output":["ParseError", "ParseError", ["Character", "\uFFFD"]]},
{"description":"Zero-prefixed hex numeric entity",
"input":"&#x000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000041;",
"output":[["Character", "A"]]},
{"description":"Zero-prefixed decimal numeric entity",
"input":"&#000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000065;",
"output":[["Character", "A"]]},
{"description":"Empty hex numeric entities",
"input":"&#x &#X ",
"output":["ParseError", ["Character", "&#x "], "ParseError", ["Character", "&#X "]]},
{"description":"Empty decimal numeric entities",
"input":"&# &#; ",
"output":["ParseError", ["Character", "&# "], "ParseError", ["Character", "&#; "]]},
{"description":"Non-BMP numeric entity",
"input":"&#x10000;",
"output":[["Character", "\uD800\uDC00"]]},
{"description":"Maximum non-BMP numeric entity",
"input":"&#X10FFFF;",
"output":[["Character", "\uDBFF\uDFFF"]]},
{"description":"Above maximum numeric entity",
"input":"&#x110000;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"32-bit hex numeric entity",
"input":"&#x80000041;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"33-bit hex numeric entity",
"input":"&#x100000041;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"33-bit decimal numeric entity",
"input":"&#4294967361;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"65-bit hex numeric entity",
"input":"&#x10000000000000041;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"65-bit decimal numeric entity",
"input":"&#18446744073709551681;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Surrogate code point edge cases",
"input":"&#xD7FF;&#xD800;&#xD801;&#xDFFE;&#xDFFF;&#xE000;",
"output":[["Character", "\uD7FF"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD\uE000"]]},
{"description":"Uppercase start tag name",
"input":"<X>",
"output":[["StartTag", "x", {}]]},
{"description":"Uppercase end tag name",
"input":"</X>",
"output":[["EndTag", "x"]]},
{"description":"Uppercase attribute name",
"input":"<x X>",
"output":[["StartTag", "x", { "x":"" }]]},
{"description":"Tag/attribute name case edge values",
"input":"<x@AZ[`az{ @AZ[`az{>",
"output":[["StartTag", "x@az[`az{", { "@az[`az{":"" }]]},
{"description":"Duplicate different-case attributes",
"input":"<x x=1 x=2 X=3>",
"output":["ParseError", "ParseError", ["StartTag", "x", { "x":"1" }]]},
{"description":"Uppercase close tag attributes",
"input":"</x X>",
"output":["ParseError", ["EndTag", "x"]]},
{"description":"Duplicate close tag attributes",
"input":"</x x x>",
"output":["ParseError", "ParseError", ["EndTag", "x"]]},
{"description":"Permitted slash",
"input":"<br/>",
"output":[["StartTag", "br", {}]]},
{"description":"Non-permitted slash",
"input":"<xr/>",
"output":["ParseError", ["StartTag", "xr", {}]]},
{"description":"Permitted slash but in close tag",
"input":"</br/>",
"output":["ParseError", ["EndTag", "br"]]},
{"description":"Doctype public case-sensitivity (1)",
"input":"<!DoCtYpE HtMl PuBlIc \"AbC\" \"XyZ\">",
"output":[["DOCTYPE", "HtMl", "AbC", "XyZ", true]]},
{"description":"Doctype public case-sensitivity (2)",
"input":"<!dOcTyPe hTmL pUbLiC \"aBc\" \"xYz\">",
"output":[["DOCTYPE", "hTmL", "aBc", "xYz", true]]},
{"description":"Doctype system case-sensitivity (1)",
"input":"<!DoCtYpE HtMl SyStEm \"XyZ\">",
"output":[["DOCTYPE", "HtMl", null, "XyZ", true]]},
{"description":"Doctype system case-sensitivity (2)",
"input":"<!dOcTyPe hTmL sYsTeM \"xYz\">",
"output":[["DOCTYPE", "hTmL", null, "xYz", true]]},
{"description":"U+0000 in lookahead region after non-matching character",
"input":"<!doc>\u0000",
"output":["ParseError", ["Comment", "doc"], "ParseError", ["Character", "\uFFFD"]],
"ignoreErrorOrder":true},
{"description":"U+0000 in lookahead region",
"input":"<!doc\u0000",
"output":["ParseError", "ParseError", ["Comment", "doc\uFFFD"]],
"ignoreErrorOrder":true},
{"description":"CR followed by U+0000",
"input":"\r\u0000",
"output":["ParseError", ["Character", "\n\uFFFD"]],
"ignoreErrorOrder":true},
{"description":"CR followed by non-LF",
"input":"\r?",
"output":[["Character", "\n?"]]},
{"description":"CR at EOF",
"input":"\r",
"output":[["Character", "\n"]]},
{"description":"LF at EOF",
"input":"\n",
"output":[["Character", "\n"]]},
{"description":"CR LF",
"input":"\r\n",
"output":[["Character", "\n"]]},
{"description":"CR CR",
"input":"\r\r",
"output":[["Character", "\n\n"]]},
{"description":"LF LF",
"input":"\n\n",
"output":[["Character", "\n\n"]]},
{"description":"LF CR",
"input":"\n\r",
"output":[["Character", "\n\n"]]},
{"description":"text CR CR CR text",
"input":"text\r\r\rtext",
"output":[["Character", "text\n\n\ntext"]]},
{"description":"Doctype publik",
"input":"<!DOCTYPE html PUBLIK \"AbC\" \"XyZ\">",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"Doctype publi",
"input":"<!DOCTYPE html PUBLI",
"output":["ParseError", "ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"Doctype sistem",
"input":"<!DOCTYPE html SISTEM \"AbC\">",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"Doctype sys",
"input":"<!DOCTYPE html SYS",
"output":["ParseError", "ParseError", ["DOCTYPE", "html", null, null, false]]}
]}

View file

@ -113,7 +113,6 @@ Line1<br>Line2<br>Line3<br>Line4
<html><head></body></html> <html><head></body></html>
#errors #errors
6: missing document type declaration 6: missing document type declaration
19: unexpected body element end tag in head
#document #document
| <html> | <html>
| <head> | <head>
@ -159,7 +158,6 @@ Line1<br>Line2<br>Line3<br>Line4
</head> </head>
#errors #errors
7: missing document type declaration 7: missing document type declaration
7: unexpected head element end tag
#document #document
| <html> | <html>
| <head> | <head>
@ -169,7 +167,6 @@ Line1<br>Line2<br>Line3<br>Line4
</body> </body>
#errors #errors
7: missing document type declaration 7: missing document type declaration
7: unexpected body element end tag
#document #document
| <html> | <html>
| <head> | <head>
@ -285,6 +282,7 @@ Line1<br>Line2<br>Line3<br>Line4
| <div> | <div>
| <b> | <b>
| <marquee> | <marquee>
| <p>
| "X" | "X"
#data #data
@ -330,6 +328,7 @@ Unexpected end of file
| <body> | <body>
| <p> | <p>
| <hr> | <hr>
| <p>
#data #data
<select><b><option><select><option></b></select>X <select><b><option><select><option></b></select>X
@ -435,6 +434,7 @@ Unexpected end of file
#data #data
<!DOCTYPE HTML><li>hello<li>world<ul>how<li>do</ul>you</body><!--do--> <!DOCTYPE HTML><li>hello<li>world<ul>how<li>do</ul>you</body><!--do-->
#errors #errors
Unexpected end of file. Expected </li>. XXX
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
| <html> | <html>
@ -636,7 +636,6 @@ Unexpected end of file
#data #data
<!DOCTYPE HTML><script> <!-- </script> --> </script> EOF <!DOCTYPE HTML><script> <!-- </script> --> </script> EOF
#errors #errors
52: unexpected script element end tag
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
| <html> | <html>
@ -730,6 +729,7 @@ Unexpected end of file
#errors #errors
6: missing document type declaration 6: missing document type declaration
29: mismatched font element end tag (misnested tags) 29: mismatched font element end tag (misnested tags)
AAA </font> tag strikes again
35: mismatched body element end tag (premature end of file?) 35: mismatched body element end tag (premature end of file?)
#document #document
| <html> | <html>
@ -1120,6 +1120,7 @@ Unexpected end of file
15: missing document type declaration 15: missing document type declaration
39: unexpected node in table context 39: unexpected node in table context
39: a element start tag implying a element end tag 39: a element start tag implying a element end tag
AAA violation: </a>
39: unexpected node in table context 39: unexpected node in table context
39: mismatched a element end tag (misnested tags across <table> tag) 39: mismatched a element end tag (misnested tags across <table> tag)
43: unexpected node in table context 43: unexpected node in table context
@ -1175,6 +1176,8 @@ Unexpected end of file
7: missing document type declaration 7: missing document type declaration
22: unexpected node in table context 22: unexpected node in table context
27: unexpected node in table context 27: unexpected node in table context
XXX more table voodoo
XXX more table voodoo
54: unexpected td element end tag implied other end tags 54: unexpected td element end tag implied other end tags
63: unexpected node in table context 63: unexpected node in table context
72: mismatched body element end tag (premature end of file?) 72: mismatched body element end tag (premature end of file?)
@ -1299,11 +1302,9 @@ unexpected EOF
#errors #errors
6: missing document type declaration 6: missing document type declaration
12: unexpected body element start tag 12: unexpected body element start tag
18: base element start tag out of place
24: link element start tag out of place
30: meta element start tag out of place
37: title element start tag out of place 37: title element start tag out of place
54: unexpected body element start tag 54: unexpected body element start tag
Missing end tag </p>. XXX
#document #document
| <html> | <html>
| <head> | <head>
@ -1344,7 +1345,6 @@ unexpected EOF
3: missing document type declaration 3: missing document type declaration
13: unexpected node in table context 13: unexpected node in table context
13: a element start tag implying a element end tag 13: a element start tag implying a element end tag
13: unexpected node in table context
13: mismatched a element end tag (misnested tags across <table> tag) 13: mismatched a element end tag (misnested tags across <table> tag)
21: mismatched table element end tag 21: mismatched table element end tag
27: a element start tag implying a element end tag 27: a element start tag implying a element end tag
@ -1369,13 +1369,14 @@ unexpected EOF
<head></p><meta><p> <head></p><meta><p>
#errors #errors
6: missing document type declaration 6: missing document type declaration
10: unexpected p element end tag in head 10: unexpected p element end tag
#document #document
| <html> | <html>
| <head> | <head>
| <meta>
| <body> | <body>
| <p> | <p>
| <meta>
| <p>
#data #data
<head></html><meta><p> <head></html><meta><p>
@ -1485,6 +1486,7 @@ unexpected EOF
| <div> | <div>
| <b> | <b>
| <marquee> | <marquee>
| <p>
#data #data
<script></script></div><title></title><p><p> <script></script></div><title></title><p><p>
@ -1511,6 +1513,7 @@ unexpected EOF
| <body> | <body>
| <p> | <p>
| <hr> | <hr>
| <p>
#data #data
<select><b><option><select><option></b></select> <select><b><option><select><option></b></select>
@ -1571,6 +1574,8 @@ unexpected EOF
<ul><li></li><div><li></div><li><li><div><li><address><li><b><em></b><li></ul> <ul><li></li><div><li></div><li><li><div><li><address><li><b><em></b><li></ul>
#errors #errors
4: missing document type declaration 4: missing document type declaration
Missing end tag for <div> (nr2)
Missing end tag for <address>
69: mismatched b element end tag (misnested tags) 69: mismatched b element end tag (misnested tags)
#document #document
| <html> | <html>
@ -1615,7 +1620,6 @@ unexpected EOF
56: unexpected frameset element start tag in body 56: unexpected frameset element start tag in body
63: unexpected frame element start tag in body 63: unexpected frame element start tag in body
74: unexpected frameset element end tag 74: unexpected frameset element end tag
87: unescaped '</' in CDATA or RCDATA block
106: unexpected end of file while parsing CDATA section for element noframes 106: unexpected end of file while parsing CDATA section for element noframes
#document #document
| <html> | <html>
@ -1630,6 +1634,7 @@ unexpected EOF
4: missing document type declaration 4: missing document type declaration
15: required tr element start tag implied by unexpected td element start tag 15: required tr element start tag implied by unexpected td element start tag
27: unexpected td element end tag implied other end tags 27: unexpected td element end tag implied other end tags
Unexpected </h1> tag. Expected other.
Unexpected EOF Unexpected EOF
#document #document
| <html> | <html>
@ -1737,9 +1742,9 @@ Unexpected EOF
108: unexpected h4 element end tag 108: unexpected h4 element end tag
113: unexpected h5 element end tag 113: unexpected h5 element end tag
118: unexpected h6 element end tag 118: unexpected h6 element end tag
125: unexpected body element end tag 125: unexpected end tag token br in after body phase
130: unexpected br element end tag 130: unexpected br element end tag
134: unexpected a element end tag 134: unexpected a element end tag (AAA)
140: unexpected img element end tag 140: unexpected img element end tag
148: unexpected title element end tag 148: unexpected title element end tag
155: unexpected span element end tag 155: unexpected span element end tag
@ -1807,6 +1812,7 @@ Unexpected EOF
| <head> | <head>
| <body> | <body>
| <br> | <br>
| <p>
#data #data
<table><tr></strong></b></em></i></u></strike></s></blink></tt></pre></big></small></font></select></h1></h2></h3></h4></h5></h6></body></br></a></img></title></span></style></script></table></th></td></tr></frame></area></link></param></hr></input></col></base></meta></basefont></bgsound></embed></spacer></p></dd></dt></caption></colgroup></tbody></tfoot></thead></address></blockquote></center></dir></div></dl></fieldset></listing></menu></ol></ul></li></nobr></wbr></form></button></marquee></object></html></frameset></head></iframe></image></isindex></noembed></noframes></noscript></optgroup></option></plaintext></textarea> <table><tr></strong></b></em></i></u></strike></s></blink></tt></pre></big></small></font></select></h1></h2></h3></h4></h5></h6></body></br></a></img></title></span></style></script></table></th></td></tr></frame></area></link></param></hr></input></col></base></meta></basefont></bgsound></embed></spacer></p></dd></dt></caption></colgroup></tbody></tfoot></thead></address></blockquote></center></dir></div></dl></fieldset></listing></menu></ol></ul></li></nobr></wbr></form></button></marquee></object></html></frameset></head></iframe></image></isindex></noembed></noframes></noscript></optgroup></option></plaintext></textarea>
@ -1920,6 +1926,9 @@ Unexpected EOF
610: unexpected option element end tag 610: unexpected option element end tag
622: unexpected plaintext element end tag 622: unexpected plaintext element end tag
633: mismatched special end tag textarea 633: mismatched special end tag textarea
XXX
XXX
XXX
#document #document
| <html> | <html>
| <head> | <head>
@ -1928,3 +1937,14 @@ Unexpected EOF
| <table> | <table>
| <tbody> | <tbody>
| <tr> | <tr>
| <p>
#data
<frameset>
#errors
10: Start tag seen without seeing a doctype first.
11: End of file seen and there were open elements.
#document
| <html>
| <head>
| <frameset>

View file

@ -12,7 +12,6 @@
<textarea>test</div>test <textarea>test</div>test
#errors #errors
10: missing document type declaration. 10: missing document type declaration.
17: unescaped '</' in CDATA or RCDATA block.
25: unexpected end of file while parsing CDATA section for element textarea. 25: unexpected end of file while parsing CDATA section for element textarea.
#document #document
| <html> | <html>
@ -87,6 +86,8 @@ Expected end tag </frameset>
#data #data
<!DOCTYPE HTML><font><p><b>test</font> <!DOCTYPE HTML><font><p><b>test</font>
#errors #errors
AAA violation. </font>
AAA violation. </font>
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
| <html> | <html>
@ -101,6 +102,7 @@ Expected end tag </frameset>
#data #data
<!DOCTYPE HTML><dt><div><dd> <!DOCTYPE HTML><dt><div><dd>
#errors #errors
Missing end tag for <div>.
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
| <html> | <html>
@ -114,7 +116,6 @@ Expected end tag </frameset>
<script></x <script></x
#errors #errors
no document type no document type
</ in script
Unexpected end of file. Expected </script> end tag. Unexpected end of file. Expected </script> end tag.
#document #document
| <html> | <html>
@ -129,6 +130,7 @@ Unexpected end of file. Expected </script> end tag.
no document type no document type
<plaintext> directly inside table <plaintext> directly inside table
Characters inside table. Characters inside table.
Characters inside table. (XXX?)
Unexpected end of file. Unexpected end of file.
#document #document
| <html> | <html>
@ -175,10 +177,10 @@ Unexpected start tag "body"
| <html> | <html>
| <head> | <head>
| <body> | <body>
| t4="4" | t1="1"
| t2="2" | t2="2"
| t3="3" | t3="3"
| t1="1" | t4="4"
#data #data
</b test </b test
@ -195,7 +197,6 @@ Unexpected end tag.
#data #data
<!DOCTYPE HTML></b test<b &=&amp>X <!DOCTYPE HTML></b test<b &=&amp>X
#errors #errors
Unexpected < in attribute
End tag contains attributes. End tag contains attributes.
Unexpected end tag. Unexpected end tag.
Named entity didn't end with ; Named entity didn't end with ;
@ -224,7 +225,6 @@ Unexpected EOF in (end) tag name
& &
#errors #errors
No doctype. No doctype.
Unfinished entity.
#document #document
| <html> | <html>
| <head> | <head>
@ -349,11 +349,11 @@ Unexpected end EOF. Missing closing tags.
| <b> | <b>
| <i> | <i>
| <u> | <u>
| " "
| <p>
| <b> | <b>
| <i> | <i>
| <u> | <u>
| " "
| <p>
| "X" | "X"
#data #data
@ -538,10 +538,10 @@ No doctype
| <hr> | <hr>
| <p> | <p>
| <label> | <label>
| "This is a searchable index. Insert your search keywords here:" | "This is a searchable index. Insert your search keywords here: "
| <input> | <input>
| test="x"
| name="isindex" | name="isindex"
| test="x"
| <hr> | <hr>
#data #data
@ -571,19 +571,18 @@ Unexpected EOF.
| <b> | <b>
| <i> | <i>
| <u> | <u>
| "
"
| <p>
| <b> | <b>
| <i> | <i>
| <u> | <u>
| "
"
| <p>
| "X" | "X"
#data #data
<!DOCTYPE HTML><body><title>test</body></title> <!DOCTYPE HTML><body><title>test</body></title>
#errors #errors
Unexpected start tag that belongs in the head. Unexpected start tag that belongs in the head.
Expected closing tag after </.
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
| <html> | <html>
@ -596,10 +595,7 @@ Expected closing tag after </.
<!DOCTYPE HTML><body><title>X</title><meta name=z><link rel=foo><style> <!DOCTYPE HTML><body><title>X</title><meta name=z><link rel=foo><style>
x { content:"</style" } </style> x { content:"</style" } </style>
#errors #errors
Unexpected start tag that belongs in head. Unexpected start tag that belongs in head. <title>
Unexpected start tag that belongs in head.
Unexpected start tag that belongs in head.
Expected closing tag after </.
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
| <html> | <html>
@ -632,8 +628,6 @@ x { content:"</style" } "
#errors #errors
No doctype. No doctype.
#document #document
| "
"
| <html> | <html>
| <head> | <head>
| <body> | <body>
@ -643,7 +637,6 @@ No doctype.
#errors #errors
#document #document
| <!DOCTYPE HTML> | <!DOCTYPE HTML>
| " "
| <html> | <html>
| <head> | <head>
| <body> | <body>
@ -749,8 +742,8 @@ Solidus (/) incorrectly placed.
| <body> | <body>
| "X" | "X"
| <p> | <p>
| y=""
| x="" | x=""
| y=""
| z="" | z=""
#data #data
@ -777,3 +770,4 @@ Unexpected </p> end tag.
| <tbody> | <tbody>
| <tr> | <tr>
| <td> | <td>
| <p>

View file

@ -61,7 +61,6 @@ No DOCTYPE
#data #data
<!DOCTYPE htML><html><head></head><body><pre> <!DOCTYPE htML><html><head></head><body><pre>
foo</pre></body></html> foo</pre></body></html>
#errors #errors
#document #document
@ -72,10 +71,22 @@ foo</pre></body></html>
| <pre> | <pre>
| "foo" | "foo"
#data #data
<!DOCTYPE htML><html><head></head><body><pre> <!DOCTYPE htML><html><head></head><body><pre>
foo</pre></body></html>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <pre>
| "
foo"
#data
<!DOCTYPE htML><html><head></head><body><pre>
foo foo
</pre></body></html> </pre></body></html>
#errors #errors
@ -120,6 +131,7 @@ y"
<!DOCTYPE htML><html><head></head><body><pre>x<div> <!DOCTYPE htML><html><head></head><body><pre>x<div>
y</pre></body></html> y</pre></body></html>
#errors #errors
End tag <pre> seen too early. Expected other end tag.
#document #document
| <!DOCTYPE htML> | <!DOCTYPE htML>
| <html> | <html>
@ -129,11 +141,12 @@ y</pre></body></html>
| "x" | "x"
| <div> | <div>
| " | "
| y" y"
#data #data
<!DOCTYPE htML><HTML><META><HEAD></HEAD></HTML> <!DOCTYPE htML><HTML><META><HEAD></HEAD></HTML>
#errors #errors
Unexpected start tag HEAD in HEAD. Ignored.
#document #document
| <!DOCTYPE htML> | <!DOCTYPE htML>
| <html> | <html>
@ -144,6 +157,7 @@ y</pre></body></html>
#data #data
<!DOCTYPE htML><HTML><HEAD><head></HEAD></HTML> <!DOCTYPE htML><HTML><HEAD><head></HEAD></HTML>
#errors #errors
Unexpected start tag HEAD in HEAD. Ignored.
#document #document
| <!DOCTYPE htML> | <!DOCTYPE htML>
| <html> | <html>
@ -153,6 +167,8 @@ y</pre></body></html>
#data #data
<textarea>foo<span>bar</span><i>baz <textarea>foo<span>bar</span><i>baz
#errors #errors
Unexpected start tag. Expected DOCTYPE.
Unexpected end of file.
#document #document
| <html> | <html>
| <head> | <head>
@ -163,6 +179,8 @@ y</pre></body></html>
#data #data
<title>foo<span>bar</em><i>baz <title>foo<span>bar</em><i>baz
#errors #errors
Unexpected start tag. Expected DOCTYPE.
Unexpected end of file.
#document #document
| <html> | <html>
| <head> | <head>
@ -183,7 +201,6 @@ y</pre></body></html>
#data #data
<!DOCTYPE htML><textarea> <!DOCTYPE htML><textarea>
foo</textarea> foo</textarea>
#errors #errors
#document #document
@ -194,6 +211,20 @@ foo</textarea>
| <textarea> | <textarea>
| "foo" | "foo"
#data
<!DOCTYPE htML><textarea>
foo</textarea>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <textarea>
| "
foo"
#data #data
<!DOCTYPE htML><html><head></head><body><ul><li><div><p><li></ul></body></html> <!DOCTYPE htML><html><head></head><body><ul><li><div><p><li></ul></body></html>
#errors #errors
@ -212,6 +243,8 @@ Missing end tag (div)
#data #data
<!doctype html><nobr><nobr><nobr> <!doctype html><nobr><nobr><nobr>
#errors #errors
Unexpected <nobr> tag.
Unexpected <nobr> tag.
Unexpected end of file. Unexpected end of file.
#document #document
| <!DOCTYPE html> | <!DOCTYPE html>
@ -225,6 +258,7 @@ Unexpected end of file.
#data #data
<!doctype html><nobr><nobr></nobr><nobr> <!doctype html><nobr><nobr></nobr><nobr>
#errors #errors
Unexpected <nobr> tag.
Unexpected end of file. Unexpected end of file.
#document #document
| <!DOCTYPE html> | <!DOCTYPE html>

View file

@ -1,37 +1,50 @@
#data #data
direct div content direct div content
#errors #errors
#document-fragment div #document-fragment
div
#document
| "direct div content" | "direct div content"
#data #data
direct textarea content direct textarea content
#errors #errors
#document-fragment textarea #document-fragment
textarea
#document
| "direct textarea content" | "direct textarea content"
#data #data
textarea content with <em>pseudo</em> <foo>markup textarea content with <em>pseudo</em> <foo>markup
#errors #errors
#document-fragment textarea #document-fragment
textarea
#document
| "textarea content with <em>pseudo</em> <foo>markup" | "textarea content with <em>pseudo</em> <foo>markup"
#data #data
this is &#x0043;DATA inside a <style> element this is &#x0043;DATA inside a <style> element
#errors #errors
#document-fragment style #document-fragment
style
#document
| "this is &#x0043;DATA inside a <style> element" | "this is &#x0043;DATA inside a <style> element"
#data #data
</plaintext> </plaintext>
#errors #errors
#document-fragment plaintext #document-fragment
plaintext
#document
| "</plaintext>" | "</plaintext>"
#data #data
setting html's innerHTML setting html's innerHTML
#errors #errors
#document-fragment html XXX innerHTML EOF
#document-fragment
html
#document
| <head> | <head>
| <body> | <body>
| "setting html's innerHTML" | "setting html's innerHTML"
@ -39,6 +52,9 @@ setting html's innerHTML
#data #data
<title>setting head's innerHTML</title> <title>setting head's innerHTML</title>
#errors #errors
#document-fragment head Unexpected title element that belongs in head.
#document-fragment
head
#document
| <title> | <title>
| "setting head's innerHTML" | "setting head's innerHTML"

View file

@ -110,7 +110,6 @@ No DOCTYPE
<style> <!</-- </style>x <style> <!</-- </style>x
#errors #errors
No DOCTYPE No DOCTYPE
Unexpected end of file
#document #document
| <html> | <html>
| <head> | <head>
@ -118,3 +117,59 @@ Unexpected end of file
| " <!</-- " | " <!</-- "
| <body> | <body>
| "x" | "x"
#data
<xmp> <!-- > --> </xmp>
#errors
No DOCTYPE
#document
| <html>
| <head>
| <body>
| <xmp>
| " <!-- > --> "
#data
<title>&amp;</title>
#errors
No DOCTYPE
#document
| <html>
| <head>
| <title>
| "&"
| <body>
#data
<title><!--&amp;--></title>
#errors
No DOCTYPE
#document
| <html>
| <head>
| <title>
| "<!--&amp;-->"
| <body>
#data
<title><!--</title>
#errors
No DOCTYPE
Unexpected EOF
#document
| <html>
| <head>
| <title>
| "<!--</title>"
| <body>
#data
<noscript><!--</noscript>--></noscript>
#errors
No DOCTYPE
#document
| <html>
| <head>
| <noscript>
| "<!--</noscript>-->"
| <body>

View file

@ -1,6 +1,7 @@
#data #data
<!doctype html></head> <head> <!doctype html></head> <head>
#errors #errors
Unexpected start tag head. Ignored.
#document #document
| <!DOCTYPE html> | <!DOCTYPE html>
| <html> | <html>
@ -11,6 +12,9 @@
#data #data
<!doctype html></html> <head> <!doctype html></html> <head>
#errors #errors
Unexpected start tag head.
Unexpected start tag head in after body phase.
Unexpected start tag head. Ignored.
#document #document
| <!DOCTYPE html> | <!DOCTYPE html>
| <html> | <html>
@ -21,9 +25,69 @@
#data #data
<!doctype html></body><meta> <!doctype html></body><meta>
#errors #errors
Unexpected meta element in after body phase.
#document #document
| <!DOCTYPE html> | <!DOCTYPE html>
| <html> | <html>
| <head> | <head>
| <body> | <body>
| <meta> | <meta>
#data
<!doctype HTml><form><div></form><div>
#errors
Form end tag ignored.
Unexpected end of file.
#document
| <!DOCTYPE HTml>
| <html>
| <head>
| <body>
| <form>
| <div>
| <div>
#data
<!doctype HTml><title>&amp;</title>
#errors
#document
| <!DOCTYPE HTml>
| <html>
| <head>
| <title>
| "&"
| <body>
#data
<!doctype HTml><title><!--&amp;--></title>
#errors
#document
| <!DOCTYPE HTml>
| <html>
| <head>
| <title>
| "<!--&amp;-->"
| <body>
#data
<!doctype>
#errors
No space after "doctype"
Unexpected ">"
Incorrect doctype
#document
| <!DOCTYPE >
| <html>
| <head>
| <body>
#data
<!---x
#errors
End of file in comment
End of file before doctype
#document
| <!-- -x -->
| <html>
| <head>
| <body>

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,159 @@
{"tests": [
{"description": "valid single class attribute value",
"input": "<span class=a>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading space",
"input": "<span class=' a'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with trailing space",
"input": "<span class='a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading and trailing space",
"input": "<span class=' a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading tab",
"input": "<span class=' a'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with trailing tab",
"input": "<span class='a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading and trailing tab",
"input": "<span class=' a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading LF",
"input": "<span class='
a'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with trailing LF",
"input": "<span class='a
'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading and trailing LF",
"input": "<span class='
a
'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading LT",
"input": "<span class=' a'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with trailing LT",
"input": "<span class='a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading and trailing LT",
"input": "<span class=' a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading FF",
"input": "<span class=' a'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with trailing FF",
"input": "<span class='a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading and trailing FF",
"input": "<span class=' a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading CR",
"input": "<span class=' a'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with trailing CR",
"input": "<span class='a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading and trailing CR",
"input": "<span class=' a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid double class attribute value separated by space",
"input": "<span class='a b'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid double class attribute value separated by tab",
"input": "<span class='a b'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid double class attribute value separated by LF",
"input": "<span class='a
b'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid double class attribute value separated by LT",
"input": "<span class='a b'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid double class attribute value separated by FF",
"input": "<span class='a b'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid double class attribute value separated by CR",
"input": "<span class='a b'>",
"fail-if": "invalid-attribute-value"},
{"description": "invalid duplicated class attribute value separated by space",
"input": "<span class='a a'>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid duplicated class attribute value separated by tab",
"input": "<span class='a a'>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid duplicated class attribute value separated by LF",
"input": "<span class='a
a'>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid duplicated class attribute value separated by LT",
"input": "<span class='a a'>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid duplicated class attribute value separated by FF",
"input": "<span class='a a'>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid duplicated class attribute value separated by CR",
"input": "<span class='a a'>",
"fail-unless": "duplicate-value-in-token-list"},
{"description": "invalid duplicated class attribute value separated by space",
"input": "<span class='a a'>",
"fail-unless": "duplicate-value-in-token-list"},
{"description": "invalid duplicated class attribute value separated by tab",
"input": "<span class='a a'>",
"fail-unless": "duplicate-value-in-token-list"},
{"description": "invalid duplicated class attribute value separated by LF",
"input": "<span class='a
a'>",
"fail-unless": "duplicate-value-in-token-list"},
{"description": "invalid duplicated class attribute value separated by LT",
"input": "<span class='a a'>",
"fail-unless": "duplicate-value-in-token-list"},
{"description": "invalid duplicated class attribute value separated by FF",
"input": "<span class='a a'>",
"fail-unless": "duplicate-value-in-token-list"},
{"description": "invalid duplicated class attribute value separated by CR",
"input": "<span class='a a'>",
"fail-unless": "duplicate-value-in-token-list"}
]}

View file

@ -0,0 +1,59 @@
{"tests": [
{"description": "valid contenteditable attribute value 'true'",
"input": "<span contenteditable=true>",
"fail-if": "invalid-attribute-value"},
{"description": "valid contenteditable attribute value 'TRUE'",
"input": "<span contenteditable=TRUE>",
"fail-if": "invalid-attribute-value"},
{"description": "valid contenteditable attribute value 'TrUe'",
"input": "<span contenteditable=TrUe>",
"fail-if": "invalid-attribute-value"},
{"description": "valid contenteditable attribute value 'false'",
"input": "<span contenteditable=false>",
"fail-if": "invalid-attribute-value"},
{"description": "valid contenteditable attribute value 'FALSE'",
"input": "<span contenteditable=FALSE>",
"fail-if": "invalid-attribute-value"},
{"description": "valid contenteditable attribute value 'FalSe'",
"input": "<span contenteditable=FalSe>",
"fail-if": "invalid-attribute-value"},
{"description": "valid contenteditable attribute value ''",
"input": "<span contenteditable=''>",
"fail-if": "invalid-attribute-value"},
{"description": "valid contenteditable attribute value (not specified)",
"input": "<span contenteditable>",
"fail-if": "invalid-attribute-value"},
{"description": "invalid contenteditable attribute value 'foo'",
"input": "<span contenteditable=foo>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid contenteditable attribute value '0'",
"input": "<span contenteditable=0>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid contenteditable attribute value '1'",
"input": "<span contenteditable=1>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid contenteditable attribute value 'yes'",
"input": "<span contenteditable=yes>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid contenteditable attribute value 'no'",
"input": "<span contenteditable=no>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid contenteditable attribute value 'inherit'",
"input": "<span contenteditable=inherit>",
"fail-unless": "invalid-attribute-value"}
]}

View file

@ -0,0 +1,118 @@
{"tests": [
{"description": "contextmenu points to valid ID earlier",
"input": "<menu id=a><span contextmenu=a>",
"fail-if": "id-does-not-exist"},
{"description": "contextmenu points to valid ID later",
"input": "<span contextmenu=a><menu id=a>",
"fail-if": "id-does-not-exist"},
{"description": "contextmenu points to non-existent ID",
"input": "<span contextmenu=a>",
"fail-unless": "id-does-not-exist"},
{"description": "contextmenu points to ID on non-menu element",
"input": "<span id=a><span contextmenu=a>",
"fail-unless": "contextmenu-must-point-to-menu"},
{"description": "uppercase contextmenu points to ID on non-menu element",
"input": "<span id=a><span CONTEXTMENU=a>",
"fail-unless": "contextmenu-must-point-to-menu"},
{"description": "valid ID 'a'",
"input": "<span contextmenu=a>",
"fail-if": "invalid-attribute-value"},
{"description": "valid ID '1'",
"input": "<span contextmenu=1>",
"fail-if": "invalid-attribute-value"},
{"description": "wacky but valid ID",
"input": "<span contextmenu='<html><head><title>a</title></head><body><p>b</p></body></html>'>",
"fail-if": "invalid-attribute-value"},
{"description": "invalid blank ID",
"input": "<span contextmenu>",
"fail-unless": "attribute-value-can-not-be-blank"},
{"description": "invalid blank ID with quotes",
"input": "<span contextmenu=''>",
"fail-unless": "attribute-value-can-not-be-blank"},
{"description": "invalid ID because of leading space",
"input": "<span contextmenu=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing space",
"input": "<span contextmenu='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of space in value",
"input": "<span contextmenu='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading tab",
"input": "<span contextmenu=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing tab",
"input": "<span contextmenu='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of tab in value",
"input": "<span contextmenu='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading LF",
"input": "<span contextmenu='
a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing LF",
"input": "<span contextmenu='a
'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of LF in value",
"input": "<span contextmenu='a
b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading LT",
"input": "<span contextmenu=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing LT",
"input": "<span contextmenu='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of LT in value",
"input": "<span contextmenu='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading FF",
"input": "<span contextmenu=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing FF",
"input": "<span contextmenu='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of FF in value",
"input": "<span contextmenu='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading CR",
"input": "<span contextmenu=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing CR",
"input": "<span contextmenu='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of CR in value",
"input": "<span contextmenu='a b'>",
"fail-unless": "space-in-id"}
]}

View file

@ -0,0 +1,118 @@
{"tests": [
{"description": "valid ID 'a'",
"input": "<span id=a>",
"fail-if": "invalid-attribute-value"},
{"description": "valid ID '1'",
"input": "<span id=1>",
"fail-if": "invalid-attribute-value"},
{"description": "wacky but valid ID",
"input": "<span id='<html><head><title>a</title></head><body><p>b</p></body></html>'>",
"fail-if": "invalid-attribute-value"},
{"description": "invalid blank ID",
"input": "<span id>",
"fail-unless": "attribute-value-can-not-be-blank"},
{"description": "invalid blank ID with quotes",
"input": "<span id=''>",
"fail-unless": "attribute-value-can-not-be-blank"},
{"description": "invalid ID because of leading space",
"input": "<span id=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing space",
"input": "<span id='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of space in value",
"input": "<span id='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading tab",
"input": "<span id=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing tab",
"input": "<span id='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of tab in value",
"input": "<span id='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading LF",
"input": "<span id='
a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing LF",
"input": "<span id='a
'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of LF in value",
"input": "<span id='a
b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading LT",
"input": "<span id=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing LT",
"input": "<span id='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of LT in value",
"input": "<span id='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading FF",
"input": "<span id=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing FF",
"input": "<span id='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of FF in value",
"input": "<span id='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading CR",
"input": "<span id=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing CR",
"input": "<span id='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of CR in value",
"input": "<span id='a b'>",
"fail-unless": "space-in-id"},
{"description": "duplicate ID values",
"input": "<span id=a><span id=a>",
"fail-unless": "duplicate-id"},
{"description": "duplicate ID values with spaces (weird but true)",
"input": "<span id='a '><span id='a '>",
"fail-unless": "duplicate-id"},
{"description": "not duplicate ID values because spaces don't match",
"input": "<span id=a><span id='a '>",
"fail-if": "duplicate-id"},
{"description": "not duplicate ID values because spaces don't match",
"input": "<span id=' a'><span id='a '>",
"fail-if": "duplicate-id"},
{"description": "not duplicate ID values because case doesn't match",
"input": "<span id=a><span id=A>",
"fail-if": "duplicate-id"}
]}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,375 @@
{"tests": [
{"description": "unknown start tag <foo>",
"input": "<foo>",
"fail-unless": "unknown-start-tag"},
{"description": "allowed start tag <code>",
"input": "<code>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <kbd>",
"input": "<kbd>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <aside>",
"input": "<aside>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <datagrid>",
"input": "<datagrid>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <font>",
"input": "<font>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <noscript>",
"input": "<noscript>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <style>",
"input": "<style>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <img>",
"input": "<img>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <title>",
"input": "<title>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <menu>",
"input": "<menu>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <tr>",
"input": "<tr>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <param>",
"input": "<param>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <li>",
"input": "<li>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <source>",
"input": "<source>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <tfoot>",
"input": "<tfoot>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <th>",
"input": "<th>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <td>",
"input": "<td>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <dl>",
"input": "<dl>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <blockquote>",
"input": "<blockquote>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <dd>",
"input": "<dd>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <abbr>",
"input": "<abbr>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <dt>",
"input": "<dt>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <noembed>",
"input": "<noembed>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <p>",
"input": "<p>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <small>",
"input": "<small>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <meter>",
"input": "<meter>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <em>",
"input": "<em>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <meta>",
"input": "<meta>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <video>",
"input": "<video>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <div>",
"input": "<div>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <canvas>",
"input": "<canvas>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <sub>",
"input": "<sub>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <section>",
"input": "<section>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <sup>",
"input": "<sup>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <progress>",
"input": "<progress>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <body>",
"input": "<body>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <base>",
"input": "<base>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <br>",
"input": "<br>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <address>",
"input": "<address>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <article>",
"input": "<article>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <strong>",
"input": "<strong>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <legend>",
"input": "<legend>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <event-source>",
"input": "<event-source>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <ol>",
"input": "<ol>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <script>",
"input": "<script>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <caption>",
"input": "<caption>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <dialog>",
"input": "<dialog>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <col>",
"input": "<col>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <h2>",
"input": "<h2>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <h3>",
"input": "<h3>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <h1>",
"input": "<h1>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <h6>",
"input": "<h6>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <h4>",
"input": "<h4>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <h5>",
"input": "<h5>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <header>",
"input": "<header>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <table>",
"input": "<table>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <span>",
"input": "<span>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <area>",
"input": "<area>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <dfn>",
"input": "<dfn>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <var>",
"input": "<var>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <cite>",
"input": "<cite>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <thead>",
"input": "<thead>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <head>",
"input": "<head>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <hr>",
"input": "<hr>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <link>",
"input": "<link>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <datatemplate>",
"input": "<datatemplate>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <b>",
"input": "<b>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <colgroup>",
"input": "<colgroup>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <ul>",
"input": "<ul>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <del>",
"input": "<del>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <iframe>",
"input": "<iframe>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <pre>",
"input": "<pre>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <figure>",
"input": "<figure>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <ins>",
"input": "<ins>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <tbody>",
"input": "<tbody>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <html>",
"input": "<html>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <nav>",
"input": "<nav>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <details>",
"input": "<details>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <samp>",
"input": "<samp>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <map>",
"input": "<map>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <nest>",
"input": "<nest>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <object>",
"input": "<object>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <a>",
"input": "<a>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <footer>",
"input": "<footer>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <i>",
"input": "<i>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <m>",
"input": "<m>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <rule>",
"input": "<rule>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <q>",
"input": "<q>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <command>",
"input": "<command>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <time>",
"input": "<time>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <audio>",
"input": "<audio>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <bdo>",
"input": "<bdo>",
"fail-if": "unknown-start-tag"}
]}

View file

@ -1,9 +1,9 @@
require 'test/unit' require 'test/unit'
HTML5LIB_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__)))) HTML5_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))
if File.exists?(File.join(HTML5LIB_BASE, 'testdata')) if File.exists?(File.join(HTML5_BASE, 'testdata'))
TESTDATA_DIR = File.join(HTML5LIB_BASE, 'testdata') TESTDATA_DIR = File.join(HTML5_BASE, 'testdata')
else else
TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata') TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata')
end end
@ -12,60 +12,15 @@ $:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')
$:.unshift File.dirname(__FILE__) $:.unshift File.dirname(__FILE__)
def html5lib_test_files(subdirectory) def html5_test_files(subdirectory)
Dir[File.join(TESTDATA_DIR, subdirectory, '*.*')] Dir[File.join(TESTDATA_DIR, subdirectory, '*.*')]
end end
begin require 'rubygems'
require 'rubygems' require 'json'
require 'json'
rescue LoadError
# Minimal stand-in for the json library, defined only when the real one
# cannot be loaded.
class JSON
  # Turns a JSON document into Ruby objects by rewriting it as a Ruby literal
  # and evaluating the result.
  #
  # json:: the JSON text; it is left untouched (a frozen string is accepted).
  #
  # Returns whatever Ruby value the rewritten text evaluates to.
  #
  # SECURITY NOTE(review): the text is passed to +eval+, so any Ruby code it
  # contains will be executed. Only feed this trusted, repository-local
  # fixtures; never use it on data from an external source.
  def self.parse json
    # Work on copies (gsub, not gsub!) so the caller's string is not modified.
    ruby_source = json.gsub(/"\s*:/, '"=>')
    # Replace \uXXXX escapes with the UTF-8 encoding of that code point.
    ruby_source = ruby_source.gsub(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
    # A local named `null` lets a bare JSON null evaluate to nil below.
    null = nil
    eval ruby_source
  end
end
end
module HTML5lib module HTML5
module TestSupport module TestSupport
# Reports whether +b+ begins with +a+. Note the argument order: the prefix
# comes first, the value being inspected second.
def self.startswith?(a, b)
  prefix_length = a.length
  b[0, prefix_length] == a
end
# Splits the text of one test case into its parts.
#
# Lines are routed into one of three lists depending on the most recent
# marker line: everything before "#errors" is input, everything between
# "#errors" and "#document"/"#document-fragment" is an error, everything
# after is expected output. Output lines that start with "|" have their
# first two characters removed. Empty lines and other lines beginning with
# "#data", "#errors" or "#document" are dropped.
#
# data:: the raw text of a single test case
#
# Returns [innerHTML, input, output, errors] where innerHTML is the text
# following "#document-fragment " (nil when the case uses "#document"),
# input and output are newline-joined strings and errors is an Array of lines.
#
# Raises ArgumentError when a "#document-fragment" marker has nothing after it.
def self.parseTestcase(data)
  innerHTML = nil
  input = []
  output = []
  errors = []
  currentList = input
  data.split(/\n/).each do |line|
    if !line.empty? && !startswith?("#errors", line) &&
        !startswith?("#document", line) &&
        !startswith?("#data", line) &&
        !startswith?("#document-fragment", line)
      # Identity check, not ==: two still-empty lists compare equal by value,
      # which would wrongly strip a leading "|" from input or error lines.
      if currentList.equal?(output) && startswith?("|", line)
        currentList.push(line[2..-1])
      else
        currentList.push(line)
      end
    elsif line == "#errors"
      currentList = errors
    elsif line == "#document" || startswith?("#document-fragment", line)
      if startswith?("#document-fragment", line)
        # Skip the 18-character marker plus the separator that follows it.
        innerHTML = line[19..-1]
        unless innerHTML
          raise ArgumentError, "#document-fragment marker is missing its context element"
        end
      end
      currentList = output
    end
  end
  return innerHTML, input.join("\n"), output.join("\n"), errors
end
# convert the output of str(document) to the format used in the testcases # convert the output of str(document) to the format used in the testcases
def convertTreeDump(treedump) def convertTreeDump(treedump)
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n") treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
@ -77,5 +32,39 @@ module HTML5lib
end end
end end
# Enumerates the records stored in a sectioned test-data file.
#
# A section starts at a line consisting of "#" followed by one of the known
# section names; the lines after it, up to the next such line, are that
# section's text. A new record begins each time the first section name is
# seen again. Lines before the first heading are ignored.
class TestData
  include Enumerable

  # filename:: path of the data file
  # sections:: ordered list of section names (without the leading "#");
  #            the first one marks the start of every record
  def initialize(filename, sections)
    # Read everything up front and release the handle straight away, so no
    # descriptor stays open and the records can be enumerated more than once.
    @lines = File.readlines(filename)
    @sections = sections
  end

  # Yields one Array per record, holding the text of each section in the
  # order given to the constructor (nil for a section the record lacks).
  # Yields nothing when the file contains no section heading at all.
  def each
    data = {}
    key = nil
    @lines.each do |line|
      # chomp rather than dropping the last character: the final line of a
      # file may have no newline.
      heading = line[0, 1] == '#' ? line.chomp[1..-1] : nil
      if heading && @sections.include?(heading)
        if data.any? && heading == @sections[0]
          # Drop the blank line separating this record from the next; the
          # newline that ends the section itself is removed in normaliseOutput.
          data[key].chomp!
          yield normaliseOutput(data)
          data = {}
        end
        key = heading
        data[key] = ''.dup
      elsif key
        data[key] << line
      end
    end
    yield normaliseOutput(data) unless data.empty?
  end

  # Strips one trailing newline from every collected section (in place) and
  # returns the section texts ordered as in @sections.
  def normaliseOutput(data)
    data.each_value { |text| text.chomp! }
    @sections.map { |heading| data[heading] }
  end
end
end end
end end

View file

@ -1,8 +1,10 @@
require File.join(File.dirname(__FILE__), 'preamble') require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/inputstream' require 'html5/inputstream'
class Html5EncodingTestCase < Test::Unit::TestCase class Html5EncodingTestCase < Test::Unit::TestCase
include HTML5
include TestSupport
begin begin
require 'rubygems' require 'rubygems'
@ -10,23 +12,21 @@ class Html5EncodingTestCase < Test::Unit::TestCase
def test_chardet def test_chardet
file = File.open(File.join(TESTDATA_DIR, 'encoding', 'chardet', 'test_big5.txt'), 'r') file = File.open(File.join(TESTDATA_DIR, 'encoding', 'chardet', 'test_big5.txt'), 'r')
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true) stream = HTML5::HTMLInputStream.new(file, :chardet => true)
assert_equal 'big5', stream.char_encoding.downcase assert_equal 'big5', stream.char_encoding.downcase
rescue LoadError rescue LoadError
puts "chardet not found, skipping chardet tests" puts "chardet not found, skipping chardet tests"
end end
end end
html5lib_test_files('encoding').each do |test_file| html5_test_files('encoding').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '').tr('-', '') test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
File.read(test_file).split("#data\n").each_with_index do |data, index| TestData.new(test_file, %w(data encoding)).
next if data.empty? each_with_index do |(input, encoding), index|
input, encoding = data.split(/\n#encoding\s+/, 2)
encoding = encoding.split[0]
define_method 'test_%s_%d' % [ test_name, index + 1 ] do define_method 'test_%s_%d' % [ test_name, index + 1 ] do
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false) stream = HTML5::HTMLInputStream.new(input, :chardet => false)
assert_equal encoding.downcase, stream.char_encoding.downcase, input assert_equal encoding.downcase, stream.char_encoding.downcase, input
end end
end end

Some files were not shown because too many files have changed in this diff Show more