Merged with Jacques' latest changes.

This commit is contained in:
Jason Blevins 2007-09-03 09:14:51 -04:00
commit b96ff30026
111 changed files with 12210 additions and 3632 deletions

View file

@ -152,8 +152,7 @@ class ApplicationController < ActionController::Base
elsif %w(tex).include?(action_name)
response.headers['Content-Type'] = 'text/plain; charset=UTF-8'
elsif request.env['HTTP_USER_AGENT'] =~ /Validator/ or request.env.include?('HTTP_ACCEPT') &&
Mime::Type.parse(request.env["HTTP_ACCEPT"]).include?(Mime::XHTML) &&
!(request.env['HTTP_USER_AGENT'] =~ /Safari/ and %w(s5).include?(action_name))
Mime::Type.parse(request.env["HTTP_ACCEPT"]).include?(Mime::XHTML)
response.headers['Content-Type'] = 'application/xhtml+xml; charset=UTF-8'
elsif request.env['HTTP_USER_AGENT'] =~ /MathPlayer/
response.headers['Content-Type'] = 'application/xhtml+xml'

View file

@ -18,7 +18,7 @@ xml.feed('xmlns' => "http://www.w3.org/2005/Atom", "xml:lang" => 'en') do
xml.name(page.author)
end
if @hide_description
xml.summary('Content suppressed.', 'type' => 'text')
xml.summary("Updated by #{page.author} on #{page.updated_at.getgm.strftime("%Y-%m-%d")} at #{page.updated_at.getgm.strftime("%H:%M:%SZ")}.", 'type' => 'text')
else
xml.content('type' => 'xhtml', 'xml:base' => url_for(:only_path => false, :web => @web_name, :action => @link_action, :id => page.name) ) do
xml.div('xmlns' => 'http://www.w3.org/1999/xhtml' ) do

View file

@ -11,6 +11,16 @@
%----Macros----------
\newcommand{\gt}{>}
\newcommand{\lt}{<}
\newcommand{\darr}{\downarrow}
\newcommand{\nearr}{\nearrow}
\newcommand{\nwarr}{\nwarrow}
\newcommand{\searr}{\searrow}
\newcommand{\swarr}{\swarrow}
\newcommand{\iff}{\Longleftrightarrow}
\newcommand{\impliedby}{\Leftarrow}
\newcommand{\map}{\mapsto}
\newcommand{\embedsin}{\hookrightarrow}
\newcommand{\implies}{\Rightarrow}
\newcommand{\qed}{\blacksquare}
%-------------------------------------------------------------------

View file

@ -16,7 +16,7 @@ class Category < Chunk::Abstract
def initialize(match_data, content)
super(match_data, content)
@hidden = match_data[1]
@list = match_data[2].split(',').map { |c| c.strip }
@list = match_data[2].split(',').map { |c| html_escape(c.strip) }
@unmask_text = ''
if @hidden
@unmask_text = ''

View file

@ -74,6 +74,13 @@ module Chunk
@content.delete_chunk(self)
end
# Replace the characters &, <, > and " with their HTML entities.
#
# string - the String to escape; it is not modified.
#
# Returns a new String with every occurrence replaced.
def html_escape(string)
  entities = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;' }
  # One pass over the input is sufficient: each matched character is replaced
  # exactly once, so the '&' that starts an inserted entity is never re-escaped.
  string.gsub(/[&<>"]/) { |char| entities[char] }
end
end
end

View file

@ -25,14 +25,14 @@
module Sanitize
require 'html5lib/html5parser'
require 'html5lib/liberalxmlparser'
require 'html5lib/treewalkers'
require 'html5lib/treebuilders'
require 'html5lib/serializer'
require 'html5lib/sanitizer'
require 'html5/html5parser'
require 'html5/liberalxmlparser'
require 'html5/treewalkers'
require 'html5/treebuilders'
require 'html5/serializer'
require 'html5/sanitizer'
include HTML5lib
include HTML5
# Sanitize a string, parsed using XHTML parsing rules.
#
@ -52,12 +52,12 @@ module Sanitize
options.each do |name, value|
next unless %w(encoding treebuilder to_tree).include? name.to_s
if name.to_s == 'treebuilder'
@treebuilder = HTML5lib::TreeBuilders.getTreeBuilder(value)
@treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value)
else
instance_variable_set("@#{name}", value)
end
end
parsed = XHTMLParser.parseFragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
parsed = XHTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
:encoding => @encoding, :tree => @treebuilder })
return parsed if @to_tree
return parsed.to_s
@ -81,12 +81,12 @@ module Sanitize
options.each do |name, value|
next unless %w(encoding treebuilder to_tree).include? name.to_s
if name.to_s == 'treebuilder'
@treebuilder = HTML5lib::TreeBuilders.getTreeBuilder(value)
@treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value)
else
instance_variable_set("@#{name}", value)
end
end
parsed = HTMLParser.parseFragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer,
:encoding => @encoding, :tree => @treebuilder })
return parsed if @to_tree
return parsed.to_s
@ -98,13 +98,9 @@ module Sanitize
# sanitize_rexml(tree) -> string
#
def sanitize_rexml(tree)
tokens = TreeWalkers.getTreeWalker('rexml').new(tree.to_ncr)
HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
:quote_attr_values => true,
:minimize_boolean_attributes => false,
:use_trailing_solidus => true,
tokens = TreeWalkers.get_tree_walker('rexml').new(tree.to_ncr)
XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
:space_before_trailing_solidus => true,
:omit_optional_tags => false,
:inject_meta_charset => false,
:sanitize => true})
end

View file

@ -16,4 +16,4 @@ table.plaintable {
text-align:center;
margin-left:30px;
}
.noborder td, .noborder th {border:0}

View file

@ -1,6 +1,6 @@
/* Following are the presentation styles -- edit away! */
body {background: #FFF; color: #000; font-size: 2em;}
body {background: #FFF; color: #000; font-size: 1.6em;}
:link, :visited {text-decoration: none; color: #00C;}
#controls :active {color: #8A8 !important;}
#controls :focus {outline: 1px dotted #272;}

View file

@ -1,4 +1,5 @@
// S5 v1.2a1 slides.js -- released into the Public Domain
// S5 v1.2a2 slides.js -- released into the Public Domain
// Many modifications by Jacques Distler to allow operation as real XHTML.
//
// Please see http://www.meyerweb.com/eric/tools/s5/credits.html for information
// about all the wonderful and talented contributors to this code!
@ -30,6 +31,7 @@ var countdown = {
var isIE = navigator.appName == 'Microsoft Internet Explorer' && navigator.userAgent.indexOf('Opera') < 1 ? 1 : 0;
var isOp = navigator.userAgent.indexOf('Opera') > -1 ? 1 : 0;
var isSa = navigator.userAgent.indexOf('Safari') > -1 ? 1 : 0;
var isGe = navigator.userAgent.indexOf('Gecko') > -1 && navigator.userAgent.indexOf('Safari') < 1 ? 1 : 0;
function hasClass(object, className) {
@ -111,7 +113,14 @@ function slideLabel() {
for (var o = 0; o < menunodes.length; o++) {
otext += nodeValue(menunodes[o]);
}
list.options[list.length] = new Option(n + ' : ' + otext, n);
if (isSa) {
var option = createElement('option');
option.setAttribute('value', n);
option.appendChild(document.createTextNode(n + ' : ' + otext) );
list.appendChild(option);
} else {
list.options[list.length] = new Option(n + ' : ' + otext, n);
}
}
}
@ -122,12 +131,12 @@ function currentSlide() {
} else {
cs = document.currentSlide;
}
var plink = document.createElement('a');
var plink = createElement('a');
plink.id = 'plink';
plink.setAttribute('href', '');
var csHere = document.createElement('span');
var csSep = document.createElement('span');
var csTotal = document.createElement('span');
var csHere = createElement('span');
var csSep = createElement('span');
var csTotal = createElement('span');
csHere.id = 'csHere';
csSep.id = 'csSep';
csTotal.id = 'csTotal';
@ -376,7 +385,7 @@ function slideJump() {
function fixLinks() {
var thisUri = window.location.href;
thisUri = thisUri.slice(0, thisUri.length - window.location.hash.length);
var aelements = document.getElementsByTagName('A');
var aelements = document.getElementsByTagName('a');
for (var i = 0; i < aelements.length; i++) {
var a = aelements[i].href;
var slideID = a.match('\#slide[0-9]{1,2}');
@ -418,43 +427,43 @@ function permaLink() {
function createControls() {
var controlsDiv = document.getElementById("controls");
if (!controlsDiv) return;
var controlForm = document.createElement('form');
var controlForm = createElement('form');
controlForm.id = 'controlForm';
controlForm.setAttribute('action', '#');
if (controlVis == 'hidden') {
controlForm.setAttribute('onmouseover', 'showHide(\'s\');');
controlForm.setAttribute('onmouseout', 'showHide(\'h\');');
}
var navLinks = document.createElement('div');
var navLinks = createElement('div');
navLinks.id = 'navLinks';
var showNotes = document.createElement('a');
var showNotes = createElement('a');
showNotes.id = 'show-notes';
showNotes.setAttribute('accesskey', 'n');
showNotes.setAttribute('href', 'javascript:createNotesWindow();');
showNotes.setAttribute('title', 'Show Notes');
showNotes.appendChild(document.createTextNode('\u2261'));
var toggle = document.createElement('a');
var toggle = createElement('a');
toggle.id = 'toggle';
toggle.setAttribute('accesskey', 't');
toggle.setAttribute('href', 'javascript:toggle();');
toggle.appendChild(document.createTextNode('\u00D8'));
var prev = document.createElement('a');
var prev = createElement('a');
prev.id = 'prev';
prev.setAttribute('accesskey', 'z');
prev.setAttribute('href', 'javascript:go(-1);');
prev.appendChild(document.createTextNode('\u00AB'));
var next = document.createElement('a');
var next = createElement('a');
next.id = 'next';
next.setAttribute('accesskey', 'x');
next.setAttribute('href', 'javascript:go(1);');
next.appendChild(document.createTextNode('\u00BB'));
var navList = document.createElement('div');
var navList = createElement('div');
navList.id = 'navList';
if (controlVis != 'hidden') {
navList.setAttribute('onmouseover', 'showHide(\'s\');');
navList.setAttribute('onmouseout', 'showHide(\'h\');');
}
var jumplist = document.createElement('select');
var jumplist = createElement('select');
jumplist.id = 'jumplist';
jumplist.setAttribute('onchange', 'go(\'j\');');
navList.appendChild(jumplist);
@ -503,7 +512,7 @@ function fontScale() { // causes layout problems in FireFox that get fixed if b
function fontSize(value) {
if (!(s5ss = document.getElementById('s5ss'))) {
if (!document.createStyleSheet) {
document.getElementsByTagName('head')[0].appendChild(s5ss = document.createElement('style'));
document.getElementsByTagName('head')[0].appendChild(s5ss = createElement('style'));
s5ss.setAttribute('media','screen, projection');
s5ss.setAttribute('id','s5ss');
} else {
@ -784,6 +793,14 @@ function readTime(val) {
}
}
// Create an element for the slide show UI.
// When the DOM implementation offers createElementNS, the element is created
// in the XHTML namespace; otherwise this falls back to plain createElement.
function createElement(element) {
  var hasNamespaces = typeof document.createElementNS != 'undefined';
  return hasNamespaces
    ? document.createElementNS('http://www.w3.org/1999/xhtml', element)
    : document.createElement(element);
}
function windowChange() {
fontScale();
}

View file

@ -0,0 +1,64 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN" "http://www.w3.org/Math/DTD/mathml2/xhtml-math11-f.dtd" >
<!-- Do not edit this document! The system will likely break if you do. -->
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Notes</title>
<link rel="stylesheet" href="default/notes.css" type="text/css" />
<script type="text/javascript">
// <![CDATA[
document.onkeyup = opener.keys;
document.onkeypress = opener.trap;
document.onclick = opener.clicker;
// ]]>
</script>
</head>
<body onload="opener.s5NotesWindowLoaded=true;" onunload="opener.s5NotesWindowLoaded=false;">
<div class="timers" id="elapsed">
<h1>
<a href="#" onclick="opener.minimizeTimer('elapsed'); return false;">Elapsed Time</a>
</h1>
<ul>
<li>
<h2>Presentation</h2>
<span class="clock" id="elapsed-presentation">00:00:00</span>
</li>
<li>
<h2>Current Slide</h2>
<span class="clock" id="elapsed-slide">00:00:00</span>
</li>
</ul>
<div class="controls">
<a href="#reset-elapsed" onclick="opener.resetElapsedTime(); return false;" title="Reset Elapsed Time">|&larr;</a>
</div>
</div>
<div class="timers" id="remaining">
<h1>
<a href="#" onclick="opener.minimizeTimer('remaining'); return false;">Remaining Time</a>
</h1>
<p>
<a href="#subtract-remaining" class="control" id="minus" onclick="opener.alterRemainingTime('-5'); return false;" title="Subtract 5 Minutes">-</a>
<span class="clock" id="timeLeft">00:00:00</span>
<a href="#add-remaining" class="control" id="plus" onclick="opener.alterRemainingTime('5'); return false;" title="Add 5 Minutes">+</a>
</p>
<div class="controls">
<form action="#" onsubmit="opener.resetRemainingTime(); return false;">
<input type="text" class="text" id="startFrom" value="0" size="4" maxlength="4" />
<a href="#toggle-remaining" onclick="opener.toggleRemainingTime(); return false;" title="Pause/Run Remaining Time">||</a>
<a href="#reset-remaining" onclick="opener.resetRemainingTime(); return false;" title="Reset Remaining Time">|&larr;</a>
</form>
</div>
</div>
<h2 id="slide">...</h2>
<div id="notes"></div>
<h2 id="next">...</h2>
<div id="nextnotes"></div>
</body>
</html>

5
vendor/plugins/HTML5lib/History.txt vendored Normal file
View file

@ -0,0 +1,5 @@
== 0.1.0 / 2007-08-07
* 1 major enhancement
* Birthday!

59
vendor/plugins/HTML5lib/Manifest.txt vendored Normal file
View file

@ -0,0 +1,59 @@
History.txt
Manifest.txt
README
Rakefile.rb
lib/html5.rb
lib/html5/constants.rb
lib/html5/filters/base.rb
lib/html5/filters/inject_meta_charset.rb
lib/html5/filters/optionaltags.rb
lib/html5/filters/sanitizer.rb
lib/html5/filters/whitespace.rb
lib/html5/html5parser.rb
lib/html5/html5parser/after_body_phase.rb
lib/html5/html5parser/after_frameset_phase.rb
lib/html5/html5parser/after_head_phase.rb
lib/html5/html5parser/before_head_phase.rb
lib/html5/html5parser/in_body_phase.rb
lib/html5/html5parser/in_caption_phase.rb
lib/html5/html5parser/in_cell_phase.rb
lib/html5/html5parser/in_column_group_phase.rb
lib/html5/html5parser/in_frameset_phase.rb
lib/html5/html5parser/in_head_phase.rb
lib/html5/html5parser/in_row_phase.rb
lib/html5/html5parser/in_select_phase.rb
lib/html5/html5parser/in_table_body_phase.rb
lib/html5/html5parser/in_table_phase.rb
lib/html5/html5parser/initial_phase.rb
lib/html5/html5parser/phase.rb
lib/html5/html5parser/root_element_phase.rb
lib/html5/html5parser/trailing_end_phase.rb
lib/html5/inputstream.rb
lib/html5/liberalxmlparser.rb
lib/html5/sanitizer.rb
lib/html5/serializer.rb
lib/html5/serializer/htmlserializer.rb
lib/html5/serializer/xhtmlserializer.rb
lib/html5/tokenizer.rb
lib/html5/treebuilders.rb
lib/html5/treebuilders/base.rb
lib/html5/treebuilders/hpricot.rb
lib/html5/treebuilders/rexml.rb
lib/html5/treebuilders/simpletree.rb
lib/html5/treewalkers.rb
lib/html5/treewalkers/base.rb
lib/html5/treewalkers/hpricot.rb
lib/html5/treewalkers/rexml.rb
lib/html5/treewalkers/simpletree.rb
lib/html5/version.rb
parse.rb
tests/preamble.rb
tests/test_encoding.rb
tests/test_lxp.rb
tests/test_parser.rb
tests/test_sanitizer.rb
tests/test_serializer.rb
tests/test_stream.rb
tests/test_tokenizer.rb
tests/test_treewalkers.rb
tests/tokenizer_test_parser.rb

View file

@ -1,9 +1,45 @@
= HTML5lib
html5
by Ryan King, et al
http://code.google.com/p/html5lib
== Basic Usage
== DESCRIPTION:
require 'html5lib'
A ruby implementation of the parsing algorithm in HTML5.
doc = HTML5lib.parse('<html>...</html>')
doc.class # REXML::Document
== FEATURES/PROBLEMS:
== SYNOPSIS:
TODO
== REQUIREMENTS:
* chardet, only tested with 0.9.0
== INSTALL:
* sudo gem install html5
== LICENSE:
Copyright (c) 2006-2007 The Authors
Contributors:
James Graham - jg307@cam.ac.uk
Anne van Kesteren - annevankesteren@gmail.com
Lachlan Hunt - lachlan.hunt@lachy.id.au
Matt McDonald - kanashii@kanashii.ca
Sam Ruby - rubys@intertwingly.net
Ian Hickson (Google) - ian@hixie.ch
Thomas Broyer - t.broyer@ltgt.net
Jacques Distler - distler@golem.ph.utexas.edu
Ryan King - ryan@theryanking.com
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View file

@ -1,7 +1,33 @@
require 'rake'
require 'rake/testtask'
require 'hoe'
require 'lib/html5/version'
Rake::TestTask.new do |task|
task.pattern = 'tests/test_*.rb'
task.verbose = true
Hoe.new("html5", HTML5::VERSION) do |p|
p.name = "html5"
p.description = p.paragraphs_of('README', 2..5).join("\n\n")
p.summary = "HTML5 parser/tokenizer."
p.author = ['Ryan King'] # TODO: add more names
p.email = 'ryan@theryanking.com'
p.url = 'http://code.google.com/p/html5lib'
p.need_zip = true
p.extra_deps << ['chardet', '>= 0.9.0']
p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
end
require 'rcov/rcovtask'
# Coverage tasks, all defined inside the `test` namespace:
#   test:coverage:clean - removes the aggregate data file "coverage.data"
#   test:coverage       - depends on test:coverage:clean, then runs
#                         tests/test_*.rb through Rcov::RcovTask and writes
#                         its output to tests/coverage/
namespace :test do
namespace :coverage do
desc "Delete aggregate coverage data."
task(:clean) { rm_f "coverage.data" }
end
desc 'Aggregate code coverage for unit, functional and integration tests'
Rcov::RcovTask.new(:coverage => "test:coverage:clean") do |t|
# Put the tests directory on the load path used for the run.
t.libs << "tests"
t.test_files = FileList["tests/test_*.rb"]
t.output_dir = "tests/coverage/"
t.verbose = true
end
end

215
vendor/plugins/HTML5lib/bin/html5 vendored Executable file
View file

@ -0,0 +1,215 @@
#!/usr/bin/env ruby
# Make the bundled library loadable regardless of the current directory.
# This script lives in bin/ while the library lives in the sibling lib/
# directory, so that directory is added by absolute path. The two entries the
# script added originally (its own directory and a cwd-relative 'lib') are
# kept after it so every invocation that worked before still works.
$:.unshift File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib')), File.dirname(__FILE__), 'lib'
# Parse the input named by the last command-line argument and print the result.
#
# opts - option struct; this method reads its treebuilder, output,
#        parsemethod, profile and time fields.
# args - remaining command-line arguments. Only the last one is used: an
#        http:// URL, '-' for standard input, or anything else, which is
#        passed to open.
#
# Exits with status 1 (after a message on stderr) when args is empty.
def parse(opts, args)
  source = args[-1]
  unless source
    $stderr.write("No filename provided. Use -h for help\n")
    exit(1)
  end

  encoding = nil
  begin
    case
    when source[0..6] == 'http://'
      require 'open-uri'
      source = URI.parse(source).open
      encoding = source.charset
    when source == '-'
      source = $stdin
    else
      source = open(source)
    end
  rescue
    # Errors while opening the input are swallowed here; whatever `source`
    # holds at that point (typically the original argument string) is handed
    # to the parser as-is.
  end

  require 'html5/treebuilders'
  treebuilder = HTML5::TreeBuilders[opts.treebuilder]

  parser =
    if opts.output == :xml
      require 'html5/liberalxmlparser'
      HTML5::XMLParser.new(:tree=>treebuilder)
    else
      require 'html5/html5parser'
      HTML5::HTMLParser.new(:tree=>treebuilder)
    end

  # Any parse method other than :parse is additionally passed 'div' as its
  # second argument.
  call_args = opts.parsemethod == :parse ? [source, encoding] : [source, 'div', encoding]

  if opts.profile
    # Profiling runs the parse only; nothing is printed except the profile.
    require 'profiler'
    Profiler__::start_profile
    parser.send(opts.parsemethod, *call_args)
    Profiler__::stop_profile
    Profiler__::print_profile($stderr)
  elsif opts.time
    require 'time' # TODO: switch to benchmark
    started = Time.new
    document = parser.send(opts.parsemethod, *call_args)
    parsed = Time.new
    print_output(parser, document, opts)
    printed = Time.new
    puts "\n\nRun took: %fs (plus %fs to print the output)"%[parsed-started, printed-parsed]
  else
    document = parser.send(opts.parsemethod, *call_args)
    print_output(parser, document, opts)
  end
end
# Print a parse result in the format selected by opts.output, optionally
# preceded by the character encoding and followed by the list of parse errors.
#
# parser   - the parser that produced document; read for
#            tokenizer.stream.char_encoding, tree and errors.
# document - the parse result to print.
# opts     - option struct; reads encoding, output, treebuilder, serializer
#            and error.
def print_output(parser, document, opts)
  if opts.encoding
    puts "Encoding: #{parser.tokenizer.stream.char_encoding}"
  end

  case opts.output
  when :xml
    print document
  when :html
    require 'html5/treewalkers'
    tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
    require 'html5/serializer'
    puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
  when :hilite
    print document.hilite
  when :tree
    # A result that cannot be iterated is treated as a one-element list.
    fragments = document.respond_to?(:each) ? document : [document]
    fragments.each { |fragment| puts parser.tree.testSerializer(fragment) }
  end

  return unless opts.error

  # Each error is a (position, message) pair; position supplies line and column.
  report = parser.errors.map do |pos, message|
    ("Line %i Col %i" % pos) + " " + message
  end
  $stdout.write("\nParse errors:\n" + report.join("\n") + "\n")
end
require 'ostruct'
# Default value of every command-line option. The option handlers defined
# below overwrite these fields; `serializer` is the options hash handed to the
# HTML serializer when printing.
options = OpenStruct.new(
  :profile     => false,
  :time        => false,
  :output      => :html,
  :treebuilder => 'simpletree',
  :error       => false,
  :encoding    => false,
  :parsemethod => :parse,
  :serializer  => {
    :encoding            => 'utf-8',
    :omit_optional_tags  => false,
    :inject_meta_charset => false
  }
)
require 'optparse'
# Command-line definition. Every handler records its value on `options`;
# serializer-related switches store theirs in the options.serializer hash.
opts = OptionParser.new do |cli|
  # Registers, in order, switches whose only effect is to store their value
  # under the given key in options.serializer.
  serializer_switches = lambda do |rows|
    rows.each do |switch, description, key|
      cli.on(switch, description) { |value| options.serializer[key] = value }
    end
  end

  cli.separator ""
  cli.separator "Parse Options:"

  cli.on("-b", "--treebuilder NAME") { |name| options.treebuilder = name }
  cli.on("-f", "--fragment", "Parse as a fragment") { options.parsemethod = :parse_fragment }

  cli.separator ""
  cli.separator "Filter Options:"

  serializer_switches.call([
    ["--[no-]inject-meta-charset", "inject <meta charset>", :inject_meta_charset],
    ["--[no-]strip-whitespace", "strip unnecessary whitespace", :strip_whitespace],
    ["--[no-]sanitize", "escape unsafe tags", :sanitize]
  ])

  cli.separator ""
  cli.separator "Output Options:"

  cli.on("--tree", "output as debug tree") { options.output = :tree }
  cli.on("-x", "--xml", "output as xml") do
    # XML output also switches the tree builder to rexml.
    options.output = :xml
    options.treebuilder = "rexml"
  end
  cli.on("--[no-]html", "Output as html") do |html|
    # --no-html clears the output format entirely.
    options.output = (html ? :html : nil)
  end
  cli.on("--hilite", "Output as formatted highlighted code.") { options.output = :hilite }
  cli.on("-e", "--error", "Print a list of parse errors") { |flag| options.error = flag }

  cli.separator ""
  cli.separator "Serialization Options:"

  serializer_switches.call([
    ["--[no-]omit-optional-tags", "Omit optional tags", :omit_optional_tags],
    ["--[no-]quote-attr-values", "Quote attribute values", :quote_attr_values],
    ["--[no-]use-best-quote-char", "Use best quote character", :use_best_quote_char],
    ["--quote-char C", "Use specified quote character", :quote_char],
    ["--[no-]minimize-boolean-attributes", "Minimize boolean attributes", :minimize_boolean_attributes],
    ["--[no-]use-trailing-solidus", "Use trailing solidus", :use_trailing_solidus],
    ["--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values", :escape_lt_in_attrs],
    ["--[no-]escape-rcdata", "Escape rcdata element values", :escape_rcdata]
  ])

  cli.separator ""
  cli.separator "Other Options:"

  cli.on("-p", "--[no-]profile", "Profile the run") { |flag| options.profile = flag }
  cli.on("-t", "--[no-]time", "Time the run") { |flag| options.time = flag }
  cli.on("-c", "--[no-]encoding", "Print character encoding used") { |flag| options.encoding = flag }

  cli.on_tail("-h", "--help", "Show this message") do
    puts cli
    exit
  end
end
# Script entry point: parse! consumes the recognised switches from ARGV in
# place, so what is passed on to parse is only the remaining arguments
# (parse uses the last one as its input).
opts.parse!(ARGV)
parse options, ARGV

13
vendor/plugins/HTML5lib/lib/html5.rb vendored Normal file
View file

@ -0,0 +1,13 @@
require 'html5/html5parser'
require 'html5/version'
# Convenience entry points that delegate to HTMLParser's class-level parsers.
module HTML5
  # Parse a complete document.
  #
  # stream  - the input, handed to HTMLParser.parse unchanged.
  # options - Hash of parser options, passed through unchanged.
  #
  # Returns whatever HTMLParser.parse returns.
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  # Parse a fragment rather than a complete document.
  #
  # stream  - the input, handed to HTMLParser.parse_fragment unchanged.
  # options - Hash of parser options, passed through unchanged.
  #
  # Returns whatever HTMLParser.parse_fragment returns.
  def self.parse_fragment(stream, options={})
    # This used to call HTMLParser.parse, which silently parsed fragments as
    # whole documents.
    HTMLParser.parse_fragment(stream, options)
  end
end

818
vendor/plugins/HTML5lib/lib/html5/constants.rb vendored Executable file
View file

@ -0,0 +1,818 @@
module HTML5
class EOF < Exception; end
CONTENT_MODEL_FLAGS = [
:PCDATA,
:RCDATA,
:CDATA,
:PLAINTEXT
]
SCOPING_ELEMENTS = %w[
button
caption
html
marquee
object
table
td
th
]
FORMATTING_ELEMENTS = %w[
a
b
big
em
font
i
nobr
s
small
strike
strong
tt
u
]
SPECIAL_ELEMENTS = %w[
address
area
base
basefont
bgsound
blockquote
body
br
center
col
colgroup
dd
dir
div
dl
dt
embed
fieldset
form
frame
frameset
h1
h2
h3
h4
h5
h6
head
hr
iframe
image
img
input
isindex
li
link
listing
menu
meta
noembed
noframes
noscript
ol
optgroup
option
p
param
plaintext
pre
script
select
spacer
style
tbody
textarea
tfoot
thead
title
tr
ul
wbr
]
SPACE_CHARACTERS = %W[
\t
\n
\x0B
\x0C
\x20
\r
]
TABLE_INSERT_MODE_ELEMENTS = %w[
table
tbody
tfoot
thead
tr
]
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
DIGITS = '0'..'9'
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
# Heading elements need to be ordered
HEADING_ELEMENTS = %w[
h1
h2
h3
h4
h5
h6
]
# XXX What about event-source and command?
VOID_ELEMENTS = %w[
base
link
meta
hr
br
img
embed
param
area
col
input
]
CDATA_ELEMENTS = %w[title textarea]
RCDATA_ELEMENTS = %w[
style
script
xmp
iframe
noembed
noframes
noscript
]
BOOLEAN_ATTRIBUTES = {
:global => %w[irrelevant],
'style' => %w[scoped],
'img' => %w[ismap],
'audio' => %w[autoplay controls],
'video' => %w[autoplay controls],
'script' => %w[defer async],
'details' => %w[open],
'datagrid' => %w[multiple disabled],
'command' => %w[hidden disabled checked default],
'menu' => %w[autosubmit],
'fieldset' => %w[disabled readonly],
'option' => %w[disabled readonly selected],
'optgroup' => %w[disabled readonly],
'button' => %w[disabled autofocus],
'input' => %w[disabled readonly required autofocus checked ismap],
'select' => %w[disabled readonly autofocus multiple],
'output' => %w[disabled readonly]
}
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN
65533, # 0x81 UNDEFINED
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
8224, # 0x86 0x2020 DAGGER
8225, # 0x87 0x2021 DOUBLE DAGGER
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
8240, # 0x89 0x2030 PER MILLE SIGN
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
65533, # 0x8D UNDEFINED
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
65533, # 0x8F UNDEFINED
65533, # 0x90 UNDEFINED
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
8226, # 0x95 0x2022 BULLET
8211, # 0x96 0x2013 EN DASH
8212, # 0x97 0x2014 EM DASH
732, # 0x98 0x02DC SMALL TILDE
8482, # 0x99 0x2122 TRADE MARK SIGN
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
65533, # 0x9D UNDEFINED
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
]
# ENTITIES was generated from Python using the following code:
#
# import constants
# entities = constants.entities.items()
# entities.sort()
# list = [ ' '.join([repr(entity), '=>', ord(value)<128 and
# repr(str(value)) or repr(value.encode('utf-8')).replace("'",'"')])
# for entity, value in entities]
# print ' ENTITIES = {\n ' + ',\n '.join(list) + '\n }'
ENTITIES = {
'AElig' => "\xc3\x86",
'AElig;' => "\xc3\x86",
'AMP' => '&',
'AMP;' => '&',
'Aacute' => "\xc3\x81",
'Aacute;' => "\xc3\x81",
'Acirc' => "\xc3\x82",
'Acirc;' => "\xc3\x82",
'Agrave' => "\xc3\x80",
'Agrave;' => "\xc3\x80",
'Alpha;' => "\xce\x91",
'Aring' => "\xc3\x85",
'Aring;' => "\xc3\x85",
'Atilde' => "\xc3\x83",
'Atilde;' => "\xc3\x83",
'Auml' => "\xc3\x84",
'Auml;' => "\xc3\x84",
'Beta;' => "\xce\x92",
'COPY' => "\xc2\xa9",
'COPY;' => "\xc2\xa9",
'Ccedil' => "\xc3\x87",
'Ccedil;' => "\xc3\x87",
'Chi;' => "\xce\xa7",
'Dagger;' => "\xe2\x80\xa1",
'Delta;' => "\xce\x94",
'ETH' => "\xc3\x90",
'ETH;' => "\xc3\x90",
'Eacute' => "\xc3\x89",
'Eacute;' => "\xc3\x89",
'Ecirc' => "\xc3\x8a",
'Ecirc;' => "\xc3\x8a",
'Egrave' => "\xc3\x88",
'Egrave;' => "\xc3\x88",
'Epsilon;' => "\xce\x95",
'Eta;' => "\xce\x97",
'Euml' => "\xc3\x8b",
'Euml;' => "\xc3\x8b",
'GT' => '>',
'GT;' => '>',
'Gamma;' => "\xce\x93",
'Iacute' => "\xc3\x8d",
'Iacute;' => "\xc3\x8d",
'Icirc' => "\xc3\x8e",
'Icirc;' => "\xc3\x8e",
'Igrave' => "\xc3\x8c",
'Igrave;' => "\xc3\x8c",
'Iota;' => "\xce\x99",
'Iuml' => "\xc3\x8f",
'Iuml;' => "\xc3\x8f",
'Kappa;' => "\xce\x9a",
'LT' => '<',
'LT;' => '<',
'Lambda;' => "\xce\x9b",
'Mu;' => "\xce\x9c",
'Ntilde' => "\xc3\x91",
'Ntilde;' => "\xc3\x91",
'Nu;' => "\xce\x9d",
'OElig;' => "\xc5\x92",
'Oacute' => "\xc3\x93",
'Oacute;' => "\xc3\x93",
'Ocirc' => "\xc3\x94",
'Ocirc;' => "\xc3\x94",
'Ograve' => "\xc3\x92",
'Ograve;' => "\xc3\x92",
'Omega;' => "\xce\xa9",
'Omicron;' => "\xce\x9f",
'Oslash' => "\xc3\x98",
'Oslash;' => "\xc3\x98",
'Otilde' => "\xc3\x95",
'Otilde;' => "\xc3\x95",
'Ouml' => "\xc3\x96",
'Ouml;' => "\xc3\x96",
'Phi;' => "\xce\xa6",
'Pi;' => "\xce\xa0",
'Prime;' => "\xe2\x80\xb3",
'Psi;' => "\xce\xa8",
'QUOT' => '"',
'QUOT;' => '"',
'REG' => "\xc2\xae",
'REG;' => "\xc2\xae",
'Rho;' => "\xce\xa1",
'Scaron;' => "\xc5\xa0",
'Sigma;' => "\xce\xa3",
'THORN' => "\xc3\x9e",
'THORN;' => "\xc3\x9e",
'TRADE;' => "\xe2\x84\xa2",
'Tau;' => "\xce\xa4",
'Theta;' => "\xce\x98",
'Uacute' => "\xc3\x9a",
'Uacute;' => "\xc3\x9a",
'Ucirc' => "\xc3\x9b",
'Ucirc;' => "\xc3\x9b",
'Ugrave' => "\xc3\x99",
'Ugrave;' => "\xc3\x99",
'Upsilon;' => "\xce\xa5",
'Uuml' => "\xc3\x9c",
'Uuml;' => "\xc3\x9c",
'Xi;' => "\xce\x9e",
'Yacute' => "\xc3\x9d",
'Yacute;' => "\xc3\x9d",
'Yuml;' => "\xc5\xb8",
'Zeta;' => "\xce\x96",
'aacute' => "\xc3\xa1",
'aacute;' => "\xc3\xa1",
'acirc' => "\xc3\xa2",
'acirc;' => "\xc3\xa2",
'acute' => "\xc2\xb4",
'acute;' => "\xc2\xb4",
'aelig' => "\xc3\xa6",
'aelig;' => "\xc3\xa6",
'agrave' => "\xc3\xa0",
'agrave;' => "\xc3\xa0",
'alefsym;' => "\xe2\x84\xb5",
'alpha;' => "\xce\xb1",
'amp' => '&',
'amp;' => '&',
'and;' => "\xe2\x88\xa7",
'ang;' => "\xe2\x88\xa0",
'apos;' => "'",
'aring' => "\xc3\xa5",
'aring;' => "\xc3\xa5",
'asymp;' => "\xe2\x89\x88",
'atilde' => "\xc3\xa3",
'atilde;' => "\xc3\xa3",
'auml' => "\xc3\xa4",
'auml;' => "\xc3\xa4",
'bdquo;' => "\xe2\x80\x9e",
'beta;' => "\xce\xb2",
'brvbar' => "\xc2\xa6",
'brvbar;' => "\xc2\xa6",
'bull;' => "\xe2\x80\xa2",
'cap;' => "\xe2\x88\xa9",
'ccedil' => "\xc3\xa7",
'ccedil;' => "\xc3\xa7",
'cedil' => "\xc2\xb8",
'cedil;' => "\xc2\xb8",
'cent' => "\xc2\xa2",
'cent;' => "\xc2\xa2",
'chi;' => "\xcf\x87",
'circ;' => "\xcb\x86",
'clubs;' => "\xe2\x99\xa3",
'cong;' => "\xe2\x89\x85",
'copy' => "\xc2\xa9",
'copy;' => "\xc2\xa9",
'crarr;' => "\xe2\x86\xb5",
'cup;' => "\xe2\x88\xaa",
'curren' => "\xc2\xa4",
'curren;' => "\xc2\xa4",
'dArr;' => "\xe2\x87\x93",
'dagger;' => "\xe2\x80\xa0",
'darr;' => "\xe2\x86\x93",
'deg' => "\xc2\xb0",
'deg;' => "\xc2\xb0",
'delta;' => "\xce\xb4",
'diams;' => "\xe2\x99\xa6",
'divide' => "\xc3\xb7",
'divide;' => "\xc3\xb7",
'eacute' => "\xc3\xa9",
'eacute;' => "\xc3\xa9",
'ecirc' => "\xc3\xaa",
'ecirc;' => "\xc3\xaa",
'egrave' => "\xc3\xa8",
'egrave;' => "\xc3\xa8",
'empty;' => "\xe2\x88\x85",
'emsp;' => "\xe2\x80\x83",
'ensp;' => "\xe2\x80\x82",
'epsilon;' => "\xce\xb5",
'equiv;' => "\xe2\x89\xa1",
'eta;' => "\xce\xb7",
'eth' => "\xc3\xb0",
'eth;' => "\xc3\xb0",
'euml' => "\xc3\xab",
'euml;' => "\xc3\xab",
'euro;' => "\xe2\x82\xac",
'exist;' => "\xe2\x88\x83",
'fnof;' => "\xc6\x92",
'forall;' => "\xe2\x88\x80",
'frac12' => "\xc2\xbd",
'frac12;' => "\xc2\xbd",
'frac14' => "\xc2\xbc",
'frac14;' => "\xc2\xbc",
'frac34' => "\xc2\xbe",
'frac34;' => "\xc2\xbe",
'frasl;' => "\xe2\x81\x84",
'gamma;' => "\xce\xb3",
'ge;' => "\xe2\x89\xa5",
'gt' => '>',
'gt;' => '>',
'hArr;' => "\xe2\x87\x94",
'harr;' => "\xe2\x86\x94",
'hearts;' => "\xe2\x99\xa5",
'hellip;' => "\xe2\x80\xa6",
'iacute' => "\xc3\xad",
'iacute;' => "\xc3\xad",
'icirc' => "\xc3\xae",
'icirc;' => "\xc3\xae",
'iexcl' => "\xc2\xa1",
'iexcl;' => "\xc2\xa1",
'igrave' => "\xc3\xac",
'igrave;' => "\xc3\xac",
'image;' => "\xe2\x84\x91",
'infin;' => "\xe2\x88\x9e",
'int;' => "\xe2\x88\xab",
'iota;' => "\xce\xb9",
'iquest' => "\xc2\xbf",
'iquest;' => "\xc2\xbf",
'isin;' => "\xe2\x88\x88",
'iuml' => "\xc3\xaf",
'iuml;' => "\xc3\xaf",
'kappa;' => "\xce\xba",
'lArr;' => "\xe2\x87\x90",
'lambda;' => "\xce\xbb",
'lang;' => "\xe3\x80\x88",
'laquo' => "\xc2\xab",
'laquo;' => "\xc2\xab",
'larr;' => "\xe2\x86\x90",
'lceil;' => "\xe2\x8c\x88",
'ldquo;' => "\xe2\x80\x9c",
'le;' => "\xe2\x89\xa4",
'lfloor;' => "\xe2\x8c\x8a",
'lowast;' => "\xe2\x88\x97",
'loz;' => "\xe2\x97\x8a",
'lrm;' => "\xe2\x80\x8e",
'lsaquo;' => "\xe2\x80\xb9",
'lsquo;' => "\xe2\x80\x98",
'lt' => '<',
'lt;' => '<',
'macr' => "\xc2\xaf",
'macr;' => "\xc2\xaf",
'mdash;' => "\xe2\x80\x94",
'micro' => "\xc2\xb5",
'micro;' => "\xc2\xb5",
'middot' => "\xc2\xb7",
'middot;' => "\xc2\xb7",
'minus;' => "\xe2\x88\x92",
'mu;' => "\xce\xbc",
'nabla;' => "\xe2\x88\x87",
'nbsp' => "\xc2\xa0",
'nbsp;' => "\xc2\xa0",
'ndash;' => "\xe2\x80\x93",
'ne;' => "\xe2\x89\xa0",
'ni;' => "\xe2\x88\x8b",
'not' => "\xc2\xac",
'not;' => "\xc2\xac",
'notin;' => "\xe2\x88\x89",
'nsub;' => "\xe2\x8a\x84",
'ntilde' => "\xc3\xb1",
'ntilde;' => "\xc3\xb1",
'nu;' => "\xce\xbd",
'oacute' => "\xc3\xb3",
'oacute;' => "\xc3\xb3",
'ocirc' => "\xc3\xb4",
'ocirc;' => "\xc3\xb4",
'oelig;' => "\xc5\x93",
'ograve' => "\xc3\xb2",
'ograve;' => "\xc3\xb2",
'oline;' => "\xe2\x80\xbe",
'omega;' => "\xcf\x89",
'omicron;' => "\xce\xbf",
'oplus;' => "\xe2\x8a\x95",
'or;' => "\xe2\x88\xa8",
'ordf' => "\xc2\xaa",
'ordf;' => "\xc2\xaa",
'ordm' => "\xc2\xba",
'ordm;' => "\xc2\xba",
'oslash' => "\xc3\xb8",
'oslash;' => "\xc3\xb8",
'otilde' => "\xc3\xb5",
'otilde;' => "\xc3\xb5",
'otimes;' => "\xe2\x8a\x97",
'ouml' => "\xc3\xb6",
'ouml;' => "\xc3\xb6",
'para' => "\xc2\xb6",
'para;' => "\xc2\xb6",
'part;' => "\xe2\x88\x82",
'permil;' => "\xe2\x80\xb0",
'perp;' => "\xe2\x8a\xa5",
'phi;' => "\xcf\x86",
'pi;' => "\xcf\x80",
'piv;' => "\xcf\x96",
'plusmn' => "\xc2\xb1",
'plusmn;' => "\xc2\xb1",
'pound' => "\xc2\xa3",
'pound;' => "\xc2\xa3",
'prime;' => "\xe2\x80\xb2",
'prod;' => "\xe2\x88\x8f",
'prop;' => "\xe2\x88\x9d",
'psi;' => "\xcf\x88",
'quot' => '"',
'quot;' => '"',
'rArr;' => "\xe2\x87\x92",
'radic;' => "\xe2\x88\x9a",
'rang;' => "\xe3\x80\x89",
'raquo' => "\xc2\xbb",
'raquo;' => "\xc2\xbb",
'rarr;' => "\xe2\x86\x92",
'rceil;' => "\xe2\x8c\x89",
'rdquo;' => "\xe2\x80\x9d",
'real;' => "\xe2\x84\x9c",
'reg' => "\xc2\xae",
'reg;' => "\xc2\xae",
'rfloor;' => "\xe2\x8c\x8b",
'rho;' => "\xcf\x81",
'rlm;' => "\xe2\x80\x8f",
'rsaquo;' => "\xe2\x80\xba",
'rsquo;' => "\xe2\x80\x99",
'sbquo;' => "\xe2\x80\x9a",
'scaron;' => "\xc5\xa1",
'sdot;' => "\xe2\x8b\x85",
'sect' => "\xc2\xa7",
'sect;' => "\xc2\xa7",
'shy' => "\xc2\xad",
'shy;' => "\xc2\xad",
'sigma;' => "\xcf\x83",
'sigmaf;' => "\xcf\x82",
'sim;' => "\xe2\x88\xbc",
'spades;' => "\xe2\x99\xa0",
'sub;' => "\xe2\x8a\x82",
'sube;' => "\xe2\x8a\x86",
'sum;' => "\xe2\x88\x91",
'sup1' => "\xc2\xb9",
'sup1;' => "\xc2\xb9",
'sup2' => "\xc2\xb2",
'sup2;' => "\xc2\xb2",
'sup3' => "\xc2\xb3",
'sup3;' => "\xc2\xb3",
'sup;' => "\xe2\x8a\x83",
'supe;' => "\xe2\x8a\x87",
'szlig' => "\xc3\x9f",
'szlig;' => "\xc3\x9f",
'tau;' => "\xcf\x84",
'there4;' => "\xe2\x88\xb4",
'theta;' => "\xce\xb8",
'thetasym;' => "\xcf\x91",
'thinsp;' => "\xe2\x80\x89",
'thorn' => "\xc3\xbe",
'thorn;' => "\xc3\xbe",
'tilde;' => "\xcb\x9c",
'times' => "\xc3\x97",
'times;' => "\xc3\x97",
'trade;' => "\xe2\x84\xa2",
'uArr;' => "\xe2\x87\x91",
'uacute' => "\xc3\xba",
'uacute;' => "\xc3\xba",
'uarr;' => "\xe2\x86\x91",
'ucirc' => "\xc3\xbb",
'ucirc;' => "\xc3\xbb",
'ugrave' => "\xc3\xb9",
'ugrave;' => "\xc3\xb9",
'uml' => "\xc2\xa8",
'uml;' => "\xc2\xa8",
'upsih;' => "\xcf\x92",
'upsilon;' => "\xcf\x85",
'uuml' => "\xc3\xbc",
'uuml;' => "\xc3\xbc",
'weierp;' => "\xe2\x84\x98",
'xi;' => "\xce\xbe",
'yacute' => "\xc3\xbd",
'yacute;' => "\xc3\xbd",
'yen' => "\xc2\xa5",
'yen;' => "\xc2\xa5",
'yuml' => "\xc3\xbf",
'yuml;' => "\xc3\xbf",
'zeta;' => "\xce\xb6",
'zwj;' => "\xe2\x80\x8d",
'zwnj;' => "\xe2\x80\x8c"
}
# Character-encoding names and aliases, all written in lower case.
# NOTE(review): presumably used to validate an encoding label before it is
# accepted -- confirm against the callers, which are outside this section.
#
# The list is frozen so that no caller can alter this shared table at runtime.
ENCODINGS = %w[
  ansi_x3.4-1968
  iso-ir-6
  ansi_x3.4-1986
  iso_646.irv:1991
  ascii
  iso646-us
  us-ascii
  us
  ibm367
  cp367
  csascii
  ks_c_5601-1987
  korean
  iso-2022-kr
  csiso2022kr
  euc-kr
  iso-2022-jp
  csiso2022jp
  iso-2022-jp-2
  iso-ir-58
  chinese
  csiso58gb231280
  iso_8859-1:1987
  iso-ir-100
  iso_8859-1
  iso-8859-1
  latin1
  l1
  ibm819
  cp819
  csisolatin1
  iso_8859-2:1987
  iso-ir-101
  iso_8859-2
  iso-8859-2
  latin2
  l2
  csisolatin2
  iso_8859-3:1988
  iso-ir-109
  iso_8859-3
  iso-8859-3
  latin3
  l3
  csisolatin3
  iso_8859-4:1988
  iso-ir-110
  iso_8859-4
  iso-8859-4
  latin4
  l4
  csisolatin4
  iso_8859-6:1987
  iso-ir-127
  iso_8859-6
  iso-8859-6
  ecma-114
  asmo-708
  arabic
  csisolatinarabic
  iso_8859-7:1987
  iso-ir-126
  iso_8859-7
  iso-8859-7
  elot_928
  ecma-118
  greek
  greek8
  csisolatingreek
  iso_8859-8:1988
  iso-ir-138
  iso_8859-8
  iso-8859-8
  hebrew
  csisolatinhebrew
  iso_8859-5:1988
  iso-ir-144
  iso_8859-5
  iso-8859-5
  cyrillic
  csisolatincyrillic
  iso_8859-9:1989
  iso-ir-148
  iso_8859-9
  iso-8859-9
  latin5
  l5
  csisolatin5
  iso-8859-10
  iso-ir-157
  l6
  iso_8859-10:1992
  csisolatin6
  latin6
  hp-roman8
  roman8
  r8
  ibm037
  cp037
  csibm037
  ibm424
  cp424
  csibm424
  ibm437
  cp437
  437
  cspc8codepage437
  ibm500
  cp500
  csibm500
  ibm775
  cp775
  cspc775baltic
  ibm850
  cp850
  850
  cspc850multilingual
  ibm852
  cp852
  852
  cspcp852
  ibm855
  cp855
  855
  csibm855
  ibm857
  cp857
  857
  csibm857
  ibm860
  cp860
  860
  csibm860
  ibm861
  cp861
  861
  cp-is
  csibm861
  ibm862
  cp862
  862
  cspc862latinhebrew
  ibm863
  cp863
  863
  csibm863
  ibm864
  cp864
  csibm864
  ibm865
  cp865
  865
  csibm865
  ibm866
  cp866
  866
  csibm866
  ibm869
  cp869
  869
  cp-gr
  csibm869
  ibm1026
  cp1026
  csibm1026
  koi8-r
  cskoi8r
  koi8-u
  big5-hkscs
  ptcp154
  csptcp154
  pt154
  cp154
  utf-7
  utf-16be
  utf-16le
  utf-16
  utf-8
  iso-8859-13
  iso-8859-14
  iso-ir-199
  iso_8859-14:1998
  iso_8859-14
  latin8
  iso-celtic
  l8
  iso-8859-15
  iso_8859-15
  iso-8859-16
  iso-ir-226
  iso_8859-16:2001
  iso_8859-16
  latin10
  l10
  gbk
  cp936
  ms936
  gb18030
  shift_jis
  ms_kanji
  csshiftjis
  euc-jp
  gb2312
  big5
  csbig5
  windows-1250
  windows-1251
  windows-1252
  windows-1253
  windows-1254
  windows-1255
  windows-1256
  windows-1257
  windows-1258
  tis-620
  hz-gb-2312
].freeze
end

View file

@ -0,0 +1 @@
require 'html5/filters/optionaltags'

View file

@ -1,7 +1,7 @@
require 'delegate'
require 'enumerator'
module HTML5lib
module HTML5
module Filters
class Base < SimpleDelegator
include Enumerable

View file

@ -1,6 +1,6 @@
require 'html5lib/filters/base'
require 'html5/filters/base'
module HTML5lib
module HTML5
module Filters
class InjectMetaCharset < Base
def initialize(source, encoding)
@ -21,9 +21,9 @@ module HTML5lib
when :EmptyTag
if token[:name].downcase == "meta"
# replace charset with actual encoding
token[:data].each_with_index do |(name,value),index|
token[:data].each_with_index do |(name, value), index|
if name == 'charset'
token[:data][index][1]=@encoding
token[:data][index][1] = @encoding
meta_found = true
end
end
@ -31,7 +31,7 @@ module HTML5lib
# replace charset with actual encoding
has_http_equiv_content_type = false
content_index = -1
token[:data].each_with_index do |(name,value),i|
token[:data].each_with_index do |(name, value), i|
if name.downcase == 'charset'
token[:data][i] = ['charset', @encoding]
meta_found = true
@ -43,30 +43,27 @@ module HTML5lib
end
end
if not meta_found
if has_http_equiv_content_type and content_index >= 0
token[:data][content_index][1] =
'text/html; charset=%s' % @encoding
if !meta_found
if has_http_equiv_content_type && content_index >= 0
token[:data][content_index][1] = 'text/html; charset=%s' % @encoding
meta_found = true
end
end
elsif token[:name].downcase == "head" and not meta_found
elsif token[:name].downcase == "head" && !meta_found
# insert meta into empty head
yield(:type => :StartTag, :name => "head", :data => token[:data])
yield(:type => :EmptyTag, :name => "meta",
:data => [["charset", @encoding]])
yield(:type => :EndTag, :name => "head")
yield :type => :StartTag, :name => "head", :data => token[:data]
yield :type => :EmptyTag, :name => "meta", :data => [["charset", @encoding]]
yield :type => :EndTag, :name => "head"
meta_found = true
next
end
when :EndTag
if token[:name].downcase == "head" and pending.any?
if token[:name].downcase == "head" && pending.any?
# insert meta into head (if necessary) and flush pending queue
yield pending.shift
yield(:type => :EmptyTag, :name => "meta",
:data => [["charset", @encoding]]) if not meta_found
yield :type => :EmptyTag, :name => "meta", :data => [["charset", @encoding]] if !meta_found
yield pending.shift while pending.any?
meta_found = true
state = :post_head

View file

@ -1,7 +1,7 @@
require 'html5lib/constants'
require 'html5lib/filters/base'
require 'html5/constants'
require 'html5/filters/base'
module HTML5lib
module HTML5
module Filters
class OptionalTagFilter < Base
@ -75,8 +75,7 @@ module HTML5lib
if type == :StartTag
# omit the thead and tfoot elements' end tag when they are
# immediately followed by a tbody element. See is_optional_end.
if previous and previous[:type] == :EndTag and \
%w(tbody thead tfoot).include?(previous[:name])
if previous and previous[:type] == :EndTag && %w(tbody thead tfoot).include?(previous[:name])
return false
end
@ -85,7 +84,7 @@ module HTML5lib
return false
end
end
return false
return false
end
def is_optional_end(tagname, nexttok)

View file

@ -1,7 +1,7 @@
require 'html5lib/filters/base'
require 'html5lib/sanitizer'
require 'html5/filters/base'
require 'html5/sanitizer'
module HTML5lib
module HTML5
module Filters
class HTMLSanitizeFilter < Base
include HTMLSanitizeModule

View file

@ -1,7 +1,7 @@
require 'html5lib/constants'
require 'html5lib/filters/base'
require 'html5/constants'
require 'html5/filters/base'
module HTML5lib
module HTML5
module Filters
class WhitespaceFilter < Base
@ -21,7 +21,7 @@ module HTML5lib
preserve -= 1 if preserve > 0
when :SpaceCharacters
next if preserve == 0
token[:data] = " " if preserve == 0 && token[:data]
when :Characters
token[:data] = token[:data].sub(SPACES,' ') if preserve == 0

View file

@ -1,246 +1,248 @@
require 'html5lib/constants'
require 'html5lib/tokenizer'
require 'html5lib/treebuilders/rexml'
Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
require 'html5lib/html5parser/' + File.basename(path)
end
module HTML5lib
# Error in parsed document. Raised by HTMLParser#parseError when the parser
# runs in strict mode.
#
# Derives from StandardError rather than Exception so that an ordinary
# `rescue` / `rescue => e` in calling code can handle it; `rescue ParseError`
# and `rescue Exception` keep working as before.
class ParseError < StandardError; end

# Failed internal sanity check.
# NOTE(review): presumably raised by the phases' `assert` helper -- confirm.
class AssertionError < StandardError; end
# HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
#
class HTMLParser
attr_accessor :phase, :firstStartTag, :innerHTML, :lastPhase, :insertFromTable
attr_reader :phases, :tokenizer, :tree, :errors
# Convenience wrapper: builds a parser from +options+ and parses +stream+
# into a document.
#
# stream  - handed unchanged to #parse
# options - Hash of parser options. :encoding is taken out and passed to
#           #parse; every remaining key goes to the constructor.
#
# Works on a copy of +options+, so the caller's hash is left untouched and
# can be reused for another call.
def self.parse(stream, options = {})
  options = options.dup
  encoding = options.delete(:encoding)
  new(options).parse(stream, encoding)
end
# Convenience wrapper: builds a parser from +options+ and parses +stream+
# as a fragment.
#
# stream  - handed unchanged to #parseFragment
# options - Hash of parser options. :container (default 'div') and :encoding
#           are taken out and passed to #parseFragment; every remaining key
#           goes to the constructor.
#
# Works on a copy of +options+, so the caller's hash is left untouched and
# can be reused for another call.
def self.parseFragment(stream, options = {})
  options = options.dup
  container = options.delete(:container) || 'div'
  encoding = options.delete(:encoding)
  new(options).parseFragment(stream, container, encoding)
end
# Base names of the parser phases. #initialize upper-cases the first letter
# and appends "Phase" to find each class in HTML5lib (e.g. "inBody" ->
# InBodyPhase) and keeps one instance per name in @phases, keyed by the name
# as a Symbol.
@@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption
inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )
# Builds a parser together with one instance of every phase.
#
# options - Hash; each key is copied into the instance variable of the same
#           name, replacing the defaults assigned below. Documented keys:
#   :strict - raise an exception when a parse error is encountered
#   :tree   - a treebuilder class controlling the type of tree that will be
#             returned. Built in treebuilders can be accessed through
#             HTML5lib::TreeBuilders[treeType]
def initialize(options = {})
  # Defaults; any of these may be overridden through +options+.
  @strict = false
  @errors = []
  @tokenizer = HTMLTokenizer
  @tree = TreeBuilders::REXML::TreeBuilder
  options.each do |key, setting|
    instance_variable_set("@#{key}", setting)
  end

  # @tree held a class up to here; from now on it is the builder instance.
  @tree = @tree.new

  # One phase object per entry of @@phases ("inBody" -> InBodyPhase).
  phase_table = {}
  @@phases.each do |phase_name|
    class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
    phase_table[phase_name.to_sym] = HTML5lib.const_get(class_name).new(self, @tree)
  end
  @phases = phase_table
end
# Shared driver behind #parse and #parseFragment: resets the parser state,
# builds a fresh tokenizer for +stream+ and feeds every token to the phase
# that is current at that moment.
#
# stream    - input handed to the tokenizer
# innerHTML - true when parsing a fragment
# encoding  - handed to the tokenizer as :encoding
# container - name of the fragment's context element (fragment parsing only)
#
# Returns whatever the current phase's processEOF returns.
def _parse(stream, innerHTML, encoding, container = 'div')
  @tree.reset
  @firstStartTag = false
  @errors = []

  # After a previous run @tokenizer holds an instance; go back to its class
  # so that a new tokenizer can be created for this stream.
  @tokenizer = @tokenizer.class unless @tokenizer.is_a?(Class)
  @tokenizer = @tokenizer.new(stream, :encoding => encoding, :parseMeta => !innerHTML)

  if innerHTML
    @innerHTML = container.downcase
    content_model =
      case @innerHTML
      when 'title', 'textarea' then :RCDATA
      when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript' then :CDATA
      when 'plaintext' then :PLAINTEXT
      end
    # For every other container the flag is left alone
    # (contentModelFlag already is PCDATA).
    @tokenizer.contentModelFlag = content_model if content_model

    @phase = @phases[:rootElement]
    @phase.insertHtmlElement
    resetInsertionMode
  else
    @innerHTML = false
    @phase = @phases[:initial]
  end

  # We only seem to have InBodyPhase testcases where the following is
  # relevant ... need others too
  @lastPhase = nil

  # XXX This is temporary for the moment so there isn't any other
  # changes needed for the parser to work with the iterable tokenizer
  @tokenizer.each do |raw_token|
    token = normalizeToken(raw_token)
    handler = "process#{token[:type]}"
    case token[:type]
    when :Characters, :SpaceCharacters, :Comment
      @phase.send(handler, token[:data])
    when :StartTag
      @phase.send(handler, token[:name], token[:data])
    when :EndTag
      @phase.send(handler, token[:name])
    when :Doctype
      @phase.send(handler, token[:name], token[:publicId], token[:systemId], token[:correct])
    else
      # Any other token type is recorded as a parse error.
      parseError(token[:data])
    end
  end

  # When the loop finishes it's EOF
  @phase.processEOF
end
# Parses a complete HTML document and returns the tree builder's document.
#
# stream   - a filelike object or string containing the HTML to be parsed
# encoding - optional string naming the encoding. If specified, that
#            encoding will be used, regardless of any BOM or later
#            declaration (such as in a meta element)
def parse(stream, encoding = nil)
  _parse(stream, false, encoding)
  @tree.getDocument
end
# Parse a HTML fragment into a well-formed tree fragment and return the tree
# builder's fragment.
#
# stream    - a filelike object or string containing the HTML to be parsed
# container - name of the element we're setting the innerHTML property;
#             if omitted or nil, defaults to 'div'
# encoding  - optional string naming the encoding. If specified, that
#             encoding will be used, regardless of any BOM or later
#             declaration (such as in a meta element)
def parseFragment(stream, container = 'div', encoding = nil)
  # A nil container used to reach `container.downcase` in _parse and raise
  # NoMethodError, although nil is documented as meaning "use the default".
  _parse(stream, true, encoding, container || 'div')
  @tree.getFragment
end
# Records a parse error and, in strict mode, raises it.
#
# data - description of the error
#        (XXX The idea is to make data mandatory.)
#
# Appends [position, data] to @errors, where position is read from
# @tokenizer.stream.position.
#
# Raises ParseError carrying +data+ as its message when @strict is set, so
# the failure can be diagnosed from the exception alone.
def parseError(data = 'XXX ERROR MESSAGE NEEDED')
  @errors.push([@tokenizer.stream.position, data])
  raise ParseError, data if @strict
end
# HTML5 specific normalizations to the token stream.
#
# token - Hash with at least :type; tag tokens also carry :name and :data
#
# The token is modified in place and returned:
#   * an :EmptyTag becomes a :StartTag; a parse error is recorded when its
#     name is not one of VOID_ELEMENTS;
#   * a :StartTag gets a lower-cased name, and a non-empty attribute list is
#     turned into a Hash with lower-cased names;
#   * an :EndTag gets a lower-cased name; a parse error is recorded when it
#     carries attributes.
def normalizeToken(token)
  if token[:type] == :EmptyTag
    # A solidus (/) inside a tag is only acceptable on a void element.
    parseError(_('Solidus (/) incorrectly placed in tag.')) unless VOID_ELEMENTS.include?(token[:name])
    token[:type] = :StartTag
  end

  case token[:type]
  when :StartTag
    token[:name] = token[:name].tr(ASCII_UPPERCASE, ASCII_LOWERCASE)
    unless token[:data].empty?
      # Reversing first makes the earliest duplicate win when the Hash is
      # built: [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
      pairs = token[:data].reverse.map do |attr, value|
        [attr.tr(ASCII_UPPERCASE, ASCII_LOWERCASE), value]
      end
      token[:data] = Hash[*pairs.flatten]
    end
  when :EndTag
    parseError(_('End tag contains unexpected attributes.')) unless token[:data].empty?
    token[:name] = token[:name].downcase
  end

  token
end
# Element name => phase key chosen by #resetInsertionMode when that element
# is met while walking the stack of open elements from the top.
# NOTE(review): 'head' maps to :inBody, not :inHead; this looks intentional
# (the HTML5 draft's reset algorithm reads the same way) -- confirm.
@@new_modes = {
'select' => :inSelect,
'td' => :inCell,
'th' => :inCell,
'tr' => :inRow,
'tbody' => :inTableBody,
'thead' => :inTableBody,
'tfoot' => :inTableBody,
'caption' => :inCaption,
'colgroup' => :inColumnGroup,
'table' => :inTable,
'head' => :inBody,
'body' => :inBody,
'frameset' => :inFrameset
}
# Picks the phase that matches the current stack of open elements.
# The name of this method is mostly historical. (It's also used in the
# specification.)
#
# Walks @tree.openElements from the most recently opened node downwards and
# stops at the first node that determines a phase:
#   * a name listed in @@new_modes selects the mapped phase;
#   * 'html' selects :beforeHead while @tree.headPointer is nil, otherwise
#     :afterHead;
#   * the bottom-most node selects :inBody when nothing else matched.
# For the bottom-most node @innerHTML is used in place of its own name,
# unless it is a td or th.
def resetInsertionMode
  at_bottom = false
  @tree.openElements.reverse.each do |node|
    nodeName = node.name
    if node == @tree.openElements[0]
      at_bottom = true
      # XXX assert @innerHTML
      nodeName = @innerHTML unless %w[td th].include?(nodeName)
    end

    # XXX select, colgroup, head and frameset should only turn up here in
    # the innerHTML case (assert @innerHTML).

    phase_key =
      if @@new_modes.has_key?(nodeName)
        @@new_modes[nodeName]
      elsif nodeName == 'html'
        @tree.headPointer.nil? ? :beforeHead : :afterHead
      elsif at_bottom
        :inBody
      end
    next if phase_key.nil?

    @phase = @phases[phase_key]
    break
  end
end
# Hook through which every error message passes; it hands +string+ back
# unchanged.
def _(string)
  string
end
end
end
require 'html5/constants'
require 'html5/tokenizer'
require 'html5/treebuilders/rexml'
Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
require 'html5/html5parser/' + File.basename(path)
end
module HTML5
# Error in parsed document. Raised by HTMLParser#parse_error when the parser
# runs in strict mode.
#
# Derives from StandardError rather than Exception so that an ordinary
# `rescue` / `rescue => e` in calling code can handle it; `rescue ParseError`
# and `rescue Exception` keep working as before.
class ParseError < StandardError; end

# Failed internal sanity check.
# NOTE(review): presumably raised by the phases' `assert` helper -- confirm.
class AssertionError < StandardError; end
# HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
#
class HTMLParser
attr_accessor :phase, :first_start_tag, :inner_html, :last_phase, :insert_from_table
attr_reader :phases, :tokenizer, :tree, :errors
# Convenience wrapper: builds a parser from +options+ and parses +stream+
# into a document.
#
# stream  - handed unchanged to #parse
# options - Hash of parser options. :encoding is taken out and passed to
#           #parse; every remaining key goes to the constructor.
#
# Works on a copy of +options+, so the caller's hash is left untouched and
# can be reused for another call.
def self.parse(stream, options = {})
  options = options.dup
  encoding = options.delete(:encoding)
  new(options).parse(stream, encoding)
end
# Convenience wrapper: builds a parser from +options+ and parses +stream+
# as a fragment.
#
# stream  - handed unchanged to #parse_fragment
# options - Hash of parser options. :container (default 'div') and :encoding
#           are taken out and passed to #parse_fragment; every remaining key
#           goes to the constructor.
#
# Works on a copy of +options+, so the caller's hash is left untouched and
# can be reused for another call.
def self.parse_fragment(stream, options = {})
  options = options.dup
  container = options.delete(:container) || 'div'
  encoding = options.delete(:encoding)
  new(options).parse_fragment(stream, container, encoding)
end
# Base names of the parser phases. #initialize upper-cases the first letter
# and appends "Phase" to find each class in HTML5 (e.g. "inBody" ->
# InBodyPhase) and keeps one instance per name in @phases, keyed by the name
# as a Symbol.
@@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption
inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )
# Builds a parser together with one instance of every phase.
#
# options - Hash; each key is copied into the instance variable of the same
#           name, replacing the defaults assigned below. Keys used here:
#   :strict - raise an exception when a parse error is encountered
#   :tree   - a treebuilder class controlling the type of tree that will be
#             returned. Built in treebuilders can be accessed through
#             HTML5::TreeBuilders[treeType]
#   :tokenizer - tokenizer class (defaults to HTMLTokenizer)
#   :lowercase_attr_name, :lowercase_element_name - handed on to the
#             tokenizer by #_parse (default nil)
def initialize(options = {})
  @strict = false
  @errors = []
  @tokenizer = HTMLTokenizer
  @tree = TreeBuilders::REXML::TreeBuilder
  # Give the pass-through tokenizer flags their nil default *before* the
  # options are applied. The earlier form tested
  # `instance_variables.include?("@name")` afterwards; instance_variables
  # returns Symbols on Ruby 1.9 and later, so that test never matched and
  # the caller's values were overwritten with nil.
  @lowercase_attr_name = nil
  @lowercase_element_name = nil

  options.each { |name, value| instance_variable_set("@#{name}", value) }

  # @tree held a class up to here; from now on it is the builder instance.
  @tree = @tree.new

  # One phase object per entry of @@phases ("inBody" -> InBodyPhase).
  @phases = @@phases.inject({}) do |phases, phase_name|
    phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
    phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
    phases
  end
end
# Shared driver behind #parse and #parse_fragment: resets the parser state,
# builds a fresh tokenizer for +stream+ and feeds every token to the phase
# that is current at that moment.
#
# stream     - input handed to the tokenizer
# inner_html - true when parsing a fragment
# encoding   - handed to the tokenizer as :encoding
# container  - name of the fragment's context element (fragment parsing only)
#
# Returns whatever the current phase's process_eof returns.
def _parse(stream, inner_html, encoding, container = 'div')
  @tree.reset
  @first_start_tag = false
  @errors = []

  # After a previous run @tokenizer holds an instance; go back to its class
  # so that a new tokenizer can be created for this stream.
  @tokenizer = @tokenizer.class unless @tokenizer.is_a?(Class)
  @tokenizer = @tokenizer.new(stream,
                              :encoding => encoding,
                              :parseMeta => !inner_html,
                              :lowercase_attr_name => @lowercase_attr_name,
                              :lowercase_element_name => @lowercase_element_name)

  if inner_html
    @inner_html = container.downcase
    content_model =
      case @inner_html
      when 'title', 'textarea' then :RCDATA
      when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript' then :CDATA
      when 'plaintext' then :PLAINTEXT
      end
    # For every other container the flag is left alone
    # (content_model_flag already is PCDATA).
    @tokenizer.content_model_flag = content_model if content_model

    @phase = @phases[:rootElement]
    @phase.insert_html_element
    reset_insertion_mode
  else
    @inner_html = false
    @phase = @phases[:initial]
  end

  # We only seem to have InBodyPhase testcases where the following is
  # relevant ... need others too
  @last_phase = nil

  # XXX This is temporary for the moment so there isn't any other
  # changes needed for the parser to work with the iterable tokenizer
  @tokenizer.each do |raw_token|
    token = normalize_token(raw_token)
    handler = "process#{token[:type]}"
    case token[:type]
    when :Characters, :SpaceCharacters, :Comment
      @phase.send(handler, token[:data])
    when :StartTag
      @phase.send(handler, token[:name], token[:data])
    when :EndTag
      @phase.send(handler, token[:name])
    when :Doctype
      @phase.send(handler, token[:name], token[:publicId], token[:systemId], token[:correct])
    else
      # Any other token type is recorded as a parse error.
      parse_error(token[:data])
    end
  end

  # When the loop finishes it's EOF
  @phase.process_eof
end
# Parses a complete HTML document and returns the tree builder's document.
#
# stream   - a filelike object or string containing the HTML to be parsed
# encoding - optional string naming the encoding. If specified, that
#            encoding will be used, regardless of any BOM or later
#            declaration (such as in a meta element)
def parse(stream, encoding = nil)
  _parse(stream, false, encoding)
  document = @tree.get_document
  document
end
# Parse a HTML fragment into a well-formed tree fragment and return the tree
# builder's fragment.
#
# stream    - a filelike object or string containing the HTML to be parsed
# container - name of the element we're setting the inner_html property;
#             if omitted or nil, defaults to 'div'
# encoding  - optional string naming the encoding. If specified, that
#             encoding will be used, regardless of any BOM or later
#             declaration (such as in a meta element)
def parse_fragment(stream, container = 'div', encoding = nil)
  # A nil container used to reach `container.downcase` in _parse and raise
  # NoMethodError, although nil is documented as meaning "use the default".
  _parse(stream, true, encoding, container || 'div')
  @tree.get_fragment
end
# Records a parse error and, in strict mode, raises it.
#
# data - description of the error
#        (XXX The idea is to make data mandatory.)
#
# Appends [position, data] to @errors, where position is read from
# @tokenizer.stream.position.
#
# Raises ParseError carrying +data+ as its message when @strict is set, so
# the failure can be diagnosed from the exception alone.
def parse_error(data = 'XXX ERROR MESSAGE NEEDED')
  @errors.push([@tokenizer.stream.position, data])
  raise ParseError, data if @strict
end
# HTML5 specific normalizations to the token stream.
#
# token - Hash with at least :type; tag tokens also carry :name and :data
#
# The token is modified in place and returned:
#   * an :EmptyTag becomes a :StartTag; a parse error is recorded when its
#     name is not one of VOID_ELEMENTS;
#   * a :StartTag gets a lower-cased name, and a non-empty attribute list is
#     turned into a Hash with lower-cased names;
#   * an :EndTag gets a lower-cased name; a parse error is recorded when it
#     carries attributes.
def normalize_token(token)
  if token[:type] == :EmptyTag
    # A solidus (/) inside a tag is only acceptable on a void element.
    parse_error(_('Solidus (/) incorrectly placed in tag.')) unless VOID_ELEMENTS.include?(token[:name])
    token[:type] = :StartTag
  end

  case token[:type]
  when :StartTag
    token[:name] = token[:name].downcase
    unless token[:data].empty?
      # Reversing first makes the earliest duplicate win when the Hash is
      # built: [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
      pairs = token[:data].reverse.map do |attr, value|
        [attr.downcase, value]
      end
      token[:data] = Hash[*pairs.flatten]
    end
  when :EndTag
    parse_error(_('End tag contains unexpected attributes.')) unless token[:data].empty?
    token[:name] = token[:name].downcase
  end

  token
end
# Element name => phase key chosen by #reset_insertion_mode when that element
# is met while walking the stack of open elements from the top.
# NOTE(review): 'head' maps to :inBody, not :inHead; this looks intentional
# (the HTML5 draft's reset algorithm reads the same way) -- confirm.
@@new_modes = {
'select' => :inSelect,
'td' => :inCell,
'th' => :inCell,
'tr' => :inRow,
'tbody' => :inTableBody,
'thead' => :inTableBody,
'tfoot' => :inTableBody,
'caption' => :inCaption,
'colgroup' => :inColumnGroup,
'table' => :inTable,
'head' => :inBody,
'body' => :inBody,
'frameset' => :inFrameset
}
# Picks the phase that matches the current stack of open elements.
# The name of this method is mostly historical. (It's also used in the
# specification.)
#
# Walks @tree.open_elements from the most recently opened node downwards and
# stops at the first node that determines a phase:
#   * a name listed in @@new_modes selects the mapped phase;
#   * 'html' selects :beforeHead while @tree.head_pointer is nil, otherwise
#     :afterHead;
#   * the bottom-most node selects :inBody when nothing else matched.
# For the bottom-most node @inner_html is used in place of its own name,
# unless it is a td or th.
def reset_insertion_mode
  at_bottom = false
  @tree.open_elements.reverse.each do |node|
    node_name = node.name
    if node == @tree.open_elements.first
      at_bottom = true
      # XXX assert @inner_html
      node_name = @inner_html unless %w[td th].include?(node_name)
    end

    # XXX select, colgroup, head and frameset should only turn up here in
    # the inner_html case (assert @inner_html).

    phase_key =
      if @@new_modes.has_key?(node_name)
        @@new_modes[node_name]
      elsif node_name == 'html'
        @tree.head_pointer.nil? ? :beforeHead : :afterHead
      elsif at_bottom
        :inBody
      end
    next if phase_key.nil?

    @phase = @phases[phase_key]
    break
  end
end
# Hook through which every error message passes; it hands +string+ back
# unchanged.
def _(string)
  string
end
end
end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class AfterBodyPhase < Phase
handle_end 'html'
@ -8,36 +8,36 @@ module HTML5lib
def processComment(data)
# This is needed because data is to be appended to the <html> element
# here and not to whatever is currently open.
@tree.insertComment(data, @tree.openElements[0])
@tree.insert_comment(data, @tree.open_elements.first)
end
def processCharacters(data)
@parser.parseError(_('Unexpected non-space characters in the after body phase.'))
parse_error(_('Unexpected non-space characters in the after body phase.'))
@parser.phase = @parser.phases[:inBody]
@parser.phase.processCharacters(data)
end
def processStartTag(name, attributes)
@parser.parseError(_("Unexpected start tag token (#{name}) in the after body phase."))
parse_error(_("Unexpected start tag token (#{name}) in the after body phase."))
@parser.phase = @parser.phases[:inBody]
@parser.phase.processStartTag(name, attributes)
end
def endTagHtml(name)
if @parser.innerHTML
@parser.parseError
if @parser.inner_html
parse_error
else
# XXX: This may need to be done, not sure
# Don't set lastPhase to the current phase but to the inBody phase
# Don't set last_phase to the current phase but to the inBody phase
# instead. No need for extra parse errors if there's something after </html>.
# Try "<!doctype html>X</html>X" for instance.
@parser.lastPhase = @parser.phase
@parser.phase = @parser.phases[:trailingEnd]
@parser.last_phase = @parser.phase
@parser.phase = @parser.phases[:trailingEnd]
end
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag token (#{name}) in the after body phase."))
parse_error(_("Unexpected end tag token (#{name}) in the after body phase."))
@parser.phase = @parser.phases[:inBody]
@parser.phase.processEndTag(name)
end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class AfterFramesetPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#after3
@ -10,7 +10,7 @@ module HTML5lib
handle_end 'html'
def processCharacters(data)
@parser.parseError(_('Unexpected non-space characters in the after frameset phase. Ignored.'))
parse_error(_('Unexpected non-space characters in the after frameset phase. Ignored.'))
end
def startTagNoframes(name, attributes)
@ -18,16 +18,16 @@ module HTML5lib
end
def startTagOther(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) in the after frameset phase. Ignored."))
parse_error(_("Unexpected start tag (#{name}) in the after frameset phase. Ignored."))
end
def endTagHtml(name)
@parser.lastPhase = @parser.phase
@parser.phase = @parser.phases[:trailingEnd]
@parser.last_phase = @parser.phase
@parser.phase = @parser.phases[:trailingEnd]
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}) in the after frameset phase. Ignored."))
parse_error(_("Unexpected end tag (#{name}) in the after frameset phase. Ignored."))
end
end

View file

@ -1,48 +1,48 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class AfterHeadPhase < Phase
handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'
def processEOF
anythingElse
@parser.phase.processEOF
def process_eof
anything_else
@parser.phase.process_eof
end
def processCharacters(data)
anythingElse
anything_else
@parser.phase.processCharacters(data)
end
def startTagBody(name, attributes)
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inBody]
end
def startTagFrameset(name, attributes)
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inFrameset]
end
def startTagFromHead(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) that can be in head. Moved."))
parse_error(_("Unexpected start tag (#{name}) that can be in head. Moved."))
@parser.phase = @parser.phases[:inHead]
@parser.phase.processStartTag(name, attributes)
end
def startTagOther(name, attributes)
anythingElse
anything_else
@parser.phase.processStartTag(name, attributes)
end
def processEndTag(name)
anythingElse
anything_else
@parser.phase.processEndTag(name)
end
def anythingElse
@tree.insertElement('body', {})
def anything_else
@tree.insert_element('body', {})
@parser.phase = @parser.phases[:inBody]
end

View file

@ -1,15 +1,15 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class BeforeHeadPhase < Phase
handle_start 'html', 'head'
handle_end %w( html head body br ) => 'ImplyHead'
handle_end %w( html head body br p ) => 'ImplyHead'
def processEOF
def process_eof
startTagHead('head', {})
@parser.phase.processEOF
@parser.phase.process_eof
end
def processCharacters(data)
@ -18,8 +18,8 @@ module HTML5lib
end
def startTagHead(name, attributes)
@tree.insertElement(name, attributes)
@tree.headPointer = @tree.openElements[-1]
@tree.insert_element(name, attributes)
@tree.head_pointer = @tree.open_elements[-1]
@parser.phase = @parser.phases[:inHead]
end
@ -34,7 +34,7 @@ module HTML5lib
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}) after the (implied) root element."))
parse_error(_("Unexpected end tag (#{name}) after the (implied) root element."))
end
end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InBodyPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
@ -51,25 +51,40 @@ module HTML5lib
# for special handling of whitespace in <pre>
@processSpaceCharactersDropNewline = false
if $-w
$-w = false
alias processSpaceCharactersNonPre processSpaceCharacters
$-w = true
else
alias processSpaceCharactersNonPre processSpaceCharacters
end
end
def processSpaceCharactersDropNewline(data)
#Sometimes (start of <pre> blocks) we want to drop leading newlines
@processSpaceCharactersDropNewline = false
if (data.length > 0 and data[0] == ?\n and
%w[pre textarea].include?(@tree.openElements[-1].name) and
not @tree.openElements[-1].hasContent)
# #Sometimes (start of <pre> blocks) we want to drop leading newlines
if $-w
$-w = false
alias processSpaceCharacters processSpaceCharactersNonPre
$-w = true
else
alias processSpaceCharacters processSpaceCharactersNonPre
end
if (data.length > 0 and data[0] == ?\n &&
%w[pre textarea].include?(@tree.open_elements.last.name) && !@tree.open_elements.last.hasContent)
data = data[1..-1]
end
@tree.insertText(data) if data.length > 0
if data.length > 0
@tree.reconstructActiveFormattingElements
@tree.insertText(data)
end
end
def processSpaceCharacters(data)
if @processSpaceCharactersDropNewline
processSpaceCharactersDropNewline(data)
else
super(data)
end
@tree.reconstructActiveFormattingElements()
@tree.insertText(data)
end
def processCharacters(data)
@ -85,20 +100,19 @@ module HTML5lib
end
def startTagTitle(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) that belongs in the head. Moved."))
parse_error(_("Unexpected start tag (#{name}) that belongs in the head. Moved."))
@parser.phases[:inHead].processStartTag(name, attributes)
end
def startTagBody(name, attributes)
@parser.parseError(_('Unexpected start tag (body).'))
parse_error(_('Unexpected start tag (body).'))
if (@tree.openElements.length == 1 or
@tree.openElements[1].name != 'body')
assert @parser.innerHTML
if (@tree.open_elements.length == 1 || @tree.open_elements[1].name != 'body')
assert @parser.inner_html
else
attributes.each do |attr, value|
unless @tree.openElements[1].attributes.has_key?(attr)
@tree.openElements[1].attributes[attr] = value
unless @tree.open_elements[1].attributes.has_key?(attr)
@tree.open_elements[1].attributes[attr] = value
end
end
end
@ -106,17 +120,17 @@ module HTML5lib
def startTagCloseP(name, attributes)
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
@processSpaceCharactersDropNewline = true if name == 'pre'
end
def startTagForm(name, attributes)
if @tree.formPointer
@parser.parseError('Unexpected start tag (form). Ignored.')
parse_error(_('Unexpected start tag (form). Ignored.'))
else
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@tree.formPointer = @tree.openElements[-1]
@tree.insert_element(name, attributes)
@tree.formPointer = @tree.open_elements[-1]
end
end
@ -125,31 +139,28 @@ module HTML5lib
stopNames = {'li' => ['li'], 'dd' => ['dd', 'dt'], 'dt' => ['dd', 'dt']}
stopName = stopNames[name]
@tree.openElements.reverse.each_with_index do |node, i|
@tree.open_elements.reverse.each_with_index do |node, i|
if stopName.include?(node.name)
poppedNodes = (0..i).collect { @tree.openElements.pop }
poppedNodes = (0..i).collect { @tree.open_elements.pop }
if i >= 1
@parser.parseError("Missing end tag%s (%s)" % [
(i>1 ? 's' : ''),
poppedNodes.reverse.map {|item| item.name}.join(', ')])
parse_error(_("Missing end tag%s (%s)" % [(i>1 ? 's' : ''), poppedNodes.reverse.map{|item| item.name}.join(', ')]))
end
break
end
# Phrasing elements are all non special, non scoping, non
# formatting elements
break if ((SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name) and
not ['address', 'div'].include?(node.name))
break if ((SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name) && !%w[address div].include?(node.name))
end
# Always insert an <li> element.
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
end
def startTagPlaintext(name, attributes)
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :PLAINTEXT
@tree.insert_element(name, attributes)
@parser.tokenizer.content_model_flag = :PLAINTEXT
end
def startTagHeading(name, attributes)
@ -158,7 +169,7 @@ module HTML5lib
# Uncomment the following for IE7 behavior:
# HEADING_ELEMENTS.each do |element|
# if in_scope?(element)
# @parser.parseError(_("Unexpected start tag (#{name})."))
# parse_error(_("Unexpected start tag (#{name})."))
#
# remove_open_elements_until do |element|
# HEADING_ELEMENTS.include?(element.name)
@ -167,14 +178,14 @@ module HTML5lib
# break
# end
# end
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
end
def startTagA(name, attributes)
if afeAElement = @tree.elementInActiveFormattingElements('a')
@parser.parseError(_('Unexpected start tag (a) implies end tag (a).'))
parse_error(_('Unexpected start tag (a) implies end tag (a).'))
endTagFormatting('a')
@tree.openElements.delete(afeAElement) if @tree.openElements.include?(afeAElement)
@tree.open_elements.delete(afeAElement) if @tree.open_elements.include?(afeAElement)
@tree.activeFormattingElements.delete(afeAElement) if @tree.activeFormattingElements.include?(afeAElement)
end
@tree.reconstructActiveFormattingElements
@ -188,77 +199,82 @@ module HTML5lib
def startTagNobr(name, attributes)
@tree.reconstructActiveFormattingElements
processEndTag('nobr') if in_scope?('nobr')
if in_scope?('nobr')
parse_error(_('Unexpected start tag (nobr) implies end tag (nobr).'))
processEndTag('nobr')
# XXX Need tests that trigger the following
@tree.reconstructActiveFormattingElements
end
addFormattingElement(name, attributes)
end
def startTagButton(name, attributes)
if in_scope?('button')
@parser.parseError(_('Unexpected start tag (button) implied end tag (button).'))
parse_error(_('Unexpected start tag (button) implied end tag (button).'))
processEndTag('button')
@parser.phase.processStartTag(name, attributes)
else
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
@tree.activeFormattingElements.push(Marker)
end
end
def startTagMarqueeObject(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
@tree.activeFormattingElements.push(Marker)
end
def startTagXmp(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :CDATA
@tree.insert_element(name, attributes)
@parser.tokenizer.content_model_flag = :CDATA
end
def startTagTable(name, attributes)
processEndTag('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inTable]
end
def startTagVoidFormatting(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@tree.openElements.pop
@tree.insert_element(name, attributes)
@tree.open_elements.pop
end
def startTagHr(name, attributes)
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@tree.openElements.pop
@tree.insert_element(name, attributes)
@tree.open_elements.pop
end
def startTagImage(name, attributes)
# No really...
@parser.parseError(_('Unexpected start tag (image). Treated as img.'))
parse_error(_('Unexpected start tag (image). Treated as img.'))
processStartTag('img', attributes)
end
def startTagInput(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
if @tree.formPointer
# XXX Not exactly sure what to do here
# @tree.openElements[-1].form = @tree.formPointer
# @tree.open_elements[-1].form = @tree.formPointer
end
@tree.openElements.pop
@tree.open_elements.pop
end
def startTagIsindex(name, attributes)
@parser.parseError("Unexpected start tag isindex. Don't use it!")
parse_error(_("Unexpected start tag isindex. Don't use it!"))
return if @tree.formPointer
processStartTag('form', {})
processStartTag('hr', {})
processStartTag('p', {})
processStartTag('label', {})
# XXX Localization ...
processCharacters('This is a searchable index. Insert your search keywords here:')
processCharacters('This is a searchable index. Insert your search keywords here: ')
attributes['name'] = 'isindex'
attrs = attributes.to_a
processStartTag('input', attributes)
@ -270,20 +286,21 @@ module HTML5lib
def startTagTextarea(name, attributes)
# XXX Form element pointer checking here as well...
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :RCDATA
@tree.insert_element(name, attributes)
@parser.tokenizer.content_model_flag = :RCDATA
@processSpaceCharactersDropNewline = true
alias processSpaceCharacters processSpaceCharactersDropNewline
end
# iframe, noembed noframes, noscript(if scripting enabled)
def startTagCdata(name, attributes)
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :CDATA
@tree.insert_element(name, attributes)
@parser.tokenizer.content_model_flag = :CDATA
end
def startTagSelect(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inSelect]
end
@ -293,7 +310,7 @@ module HTML5lib
# "caption", "col", "colgroup", "frame", "frameset", "head",
# "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
# "tr", "noscript"
@parser.parseError(_("Unexpected start tag (#{name}). Ignored."))
parse_error(_("Unexpected start tag (#{name}). Ignored."))
end
def startTagNew(name, attributes)
@ -306,33 +323,38 @@ module HTML5lib
def startTagOther(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
end
def endTagP(name)
@tree.generateImpliedEndTags('p') if in_scope?('p')
@parser.parseError('Unexpected end tag (p).') unless @tree.openElements[-1].name == 'p'
@tree.openElements.pop while in_scope?('p')
parse_error(_('Unexpected end tag (p).')) unless @tree.open_elements.last.name == 'p'
if in_scope?('p')
@tree.open_elements.pop while in_scope?('p')
else
startTagCloseP('p', {})
endTagP('p')
end
end
def endTagBody(name)
# XXX Need to take open <p> tags into account here. We shouldn't imply
# </p> but we should not throw a parse error either. Specification is
# likely to be updated.
unless @tree.openElements[1].name == 'body'
# innerHTML case
@parser.parseError
unless @tree.open_elements[1].name == 'body'
# inner_html case
parse_error
return
end
unless @tree.openElements[-1].name == 'body'
@parser.parseError(_("Unexpected end tag (body). Missing end tag (#{@tree.openElements[-1].name})."))
unless @tree.open_elements.last.name == 'body'
parse_error(_("Unexpected end tag (body). Missing end tag (#{@tree.open_elements[-1].name})."))
end
@parser.phase = @parser.phases[:afterBody]
end
def endTagHtml(name)
endTagBody(name)
@parser.phase.processEndTag(name) unless @parser.innerHTML
@parser.phase.processEndTag(name) unless @parser.inner_html
end
def endTagBlock(name)
@ -341,8 +363,8 @@ module HTML5lib
@tree.generateImpliedEndTags if in_scope?(name)
unless @tree.openElements[-1].name == name
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
unless @tree.open_elements.last.name == name
parse_error(_("End tag (#{name}) seen too early. Expected other end tag."))
end
if in_scope?(name)
@ -351,18 +373,23 @@ module HTML5lib
end
def endTagForm(name)
endTagBlock(name)
if in_scope?(name)
@tree.generateImpliedEndTags
end
if @tree.open_elements.last.name != name
parse_error(_("End tag (form) seen too early. Ignored."))
else
@tree.open_elements.pop
end
@tree.formPointer = nil
end
def endTagListItem(name)
# AT Could merge this with the Block case
if in_scope?(name)
@tree.generateImpliedEndTags(name)
@tree.generateImpliedEndTags(name) if in_scope?(name)
unless @tree.openElements[-1].name == name
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
end
unless @tree.open_elements.last.name == name
parse_error(_("End tag (#{name}) seen too early. " + 'Expected other end tag.'))
end
remove_open_elements_until(name) if in_scope?(name)
@ -376,13 +403,13 @@ module HTML5lib
end
end
unless @tree.openElements[-1].name == name
@parser.parseError(("Unexpected end tag (#{name}). Expected other end tag."))
unless @tree.open_elements.last.name == name
parse_error(_("Unexpected end tag (#{name}). Expected other end tag."))
end
HEADING_ELEMENTS.each do |element|
if in_scope?(element)
remove_open_elements_until { |element| HEADING_ELEMENTS.include?(element.name) }
remove_open_elements_until {|element| HEADING_ELEMENTS.include?(element.name)}
break
end
end
@ -391,30 +418,30 @@ module HTML5lib
# The much-feared adoption agency algorithm
def endTagFormatting(name)
# http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
# XXX Better parseError messages appreciated.
# XXX Better parse_error messages appreciated.
while true
# Step 1 paragraph 1
afeElement = @tree.elementInActiveFormattingElements(name)
if not afeElement or (@tree.openElements.include?(afeElement) and not in_scope?(afeElement.name))
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 1 of the adoption agency algorithm."))
if !afeElement or (@tree.open_elements.include?(afeElement) && !in_scope?(afeElement.name))
parse_error(_("End tag (#{name}) violates step 1, paragraph 1 of the adoption agency algorithm."))
return
# Step 1 paragraph 2
elsif not @tree.openElements.include?(afeElement)
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 2 of the adoption agency algorithm."))
elsif not @tree.open_elements.include?(afeElement)
parse_error(_("End tag (#{name}) violates step 1, paragraph 2 of the adoption agency algorithm."))
@tree.activeFormattingElements.delete(afeElement)
return
end
# Step 1 paragraph 3
if afeElement != @tree.openElements[-1]
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 3 of the adoption agency algorithm."))
if afeElement != @tree.open_elements.last
parse_error(_("End tag (#{name}) violates step 1, paragraph 3 of the adoption agency algorithm."))
end
# Step 2
# Start of the adoption agency algorithm proper
afeIndex = @tree.openElements.index(afeElement)
afeIndex = @tree.open_elements.index(afeElement)
furthestBlock = nil
@tree.openElements[afeIndex..-1].each do |element|
@tree.open_elements[afeIndex..-1].each do |element|
if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(element.name)
furthestBlock = element
break
@ -423,11 +450,11 @@ module HTML5lib
# Step 3
if furthestBlock.nil?
element = remove_open_elements_until { |element| element == afeElement }
element = remove_open_elements_until {|element| element == afeElement }
@tree.activeFormattingElements.delete(element)
return
end
commonAncestor = @tree.openElements[afeIndex - 1]
commonAncestor = @tree.open_elements[afeIndex - 1]
# Step 5
furthestBlock.parent.removeChild(furthestBlock) if furthestBlock.parent
@ -444,11 +471,11 @@ module HTML5lib
while true
# AT replace this with a function and recursion?
# Node is element before node in open elements
node = @tree.openElements[@tree.openElements.index(node) - 1]
node = @tree.open_elements[@tree.open_elements.index(node) - 1]
until @tree.activeFormattingElements.include?(node)
tmpNode = node
node = @tree.openElements[@tree.openElements.index(node) - 1]
@tree.openElements.delete(tmpNode)
node = @tree.open_elements[@tree.open_elements.index(node) - 1]
@tree.open_elements.delete(tmpNode)
end
# Step 7.3
break if node == afeElement
@ -465,7 +492,7 @@ module HTML5lib
clone = node.cloneNode
# Replace node with clone
@tree.activeFormattingElements[@tree.activeFormattingElements.index(node)] = clone
@tree.openElements[@tree.openElements.index(node)] = clone
@tree.open_elements[@tree.open_elements.index(node)] = clone
node = clone
end
# Step 7.6
@ -495,47 +522,47 @@ module HTML5lib
@tree.activeFormattingElements.insert([bookmark,@tree.activeFormattingElements.length].min, clone)
# Step 13
@tree.openElements.delete(afeElement)
@tree.openElements.insert(@tree.openElements.index(furthestBlock) + 1, clone)
@tree.open_elements.delete(afeElement)
@tree.open_elements.insert(@tree.open_elements.index(furthestBlock) + 1, clone)
end
end
def endTagButtonMarqueeObject(name)
@tree.generateImpliedEndTags if in_scope?(name)
unless @tree.openElements[-1].name == name
@parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag first."))
unless @tree.open_elements.last.name == name
parse_error(_("Unexpected end tag (#{name}). Expected other end tag first."))
end
if in_scope?(name)
remove_open_elements_until(name)
@tree.clearActiveFormattingElements
end
end
def endTagMisplaced(name)
# This handles elements with end tags in other insertion modes.
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
parse_error(_("Unexpected end tag (#{name}). Ignored."))
end
def endTagBr(name)
@parser.parseError(_("Unexpected end tag (br). Treated as br element."))
parse_error(_("Unexpected end tag (br). Treated as br element."))
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, {})
@tree.openElements.pop()
@tree.insert_element(name, {})
@tree.open_elements.pop()
end
def endTagNone(name)
# This handles elements with no end tag.
@parser.parseError(_("This tag (#{name}) has no end tag"))
parse_error(_("This tag (#{name}) has no end tag"))
end
def endTagCdataTextAreaXmp(name)
if @tree.openElements[-1].name == name
@tree.openElements.pop
if @tree.open_elements.last.name == name
@tree.open_elements.pop
else
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
parse_error(_("Unexpected end tag (#{name}). Ignored."))
end
end
@ -549,20 +576,20 @@ module HTML5lib
def endTagOther(name)
# XXX This logic should be moved into the treebuilder
@tree.openElements.reverse.each do |node|
@tree.open_elements.reverse.each do |node|
if node.name == name
@tree.generateImpliedEndTags
unless @tree.openElements[-1].name == name
@parser.parseError(_("Unexpected end tag (#{name})."))
unless @tree.open_elements.last.name == name
parse_error(_("Unexpected end tag (#{name})."))
end
remove_open_elements_until { |element| element == node }
remove_open_elements_until {|element| element == node }
break
else
if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
parse_error(_("Unexpected end tag (#{name}). Ignored."))
break
end
end
@ -572,8 +599,8 @@ module HTML5lib
protected
def addFormattingElement(name, attributes)
@tree.insertElement(name, attributes)
@tree.activeFormattingElements.push(@tree.openElements[-1])
@tree.insert_element(name, attributes)
@tree.activeFormattingElements.push(@tree.open_elements.last)
end
end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InCaptionPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption
@ -10,7 +10,7 @@ module HTML5lib
handle_end 'caption', 'table', %w( body col colgroup html tbody td tfoot th thead tr ) => 'Ignore'
def ignoreEndTagCaption
not in_scope?('caption', true)
!in_scope?('caption', true)
end
def processCharacters(data)
@ -18,7 +18,7 @@ module HTML5lib
end
def startTagTableElement(name, attributes)
@parser.parseError
parse_error
#XXX Have to duplicate logic here to find out if the tag is ignored
ignoreEndTag = ignoreEndTagCaption
@parser.phase.processEndTag('caption')
@ -31,15 +31,15 @@ module HTML5lib
def endTagCaption(name)
if ignoreEndTagCaption
# innerHTML case
assert @parser.innerHTML
@parser.parseError
# inner_html case
assert @parser.inner_html
parse_error
else
# AT this code is quite similar to endTagTable in "InTable"
@tree.generateImpliedEndTags
unless @tree.openElements[-1].name == 'caption'
@parser.parseError(_("Unexpected end tag (caption). Missing end tags."))
unless @tree.open_elements[-1].name == 'caption'
parse_error(_("Unexpected end tag (caption). Missing end tags."))
end
remove_open_elements_until('caption')
@ -50,14 +50,14 @@ module HTML5lib
end
def endTagTable(name)
@parser.parseError
parse_error
ignoreEndTag = ignoreEndTagCaption
@parser.phase.processEndTag('caption')
@parser.phase.processEndTag(name) unless ignoreEndTag
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
parse_error(_("Unexpected end tag (#{name}). Ignored."))
end
def endTagOther(name)

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InCellPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
@ -20,8 +20,8 @@ module HTML5lib
closeCell
@parser.phase.processStartTag(name, attributes)
else
# innerHTML case
@parser.parseError
# inner_html case
parse_error
end
end
@ -32,22 +32,22 @@ module HTML5lib
def endTagTableCell(name)
if in_scope?(name, true)
@tree.generateImpliedEndTags(name)
if @tree.openElements[-1].name != name
@parser.parseError("Got table cell end tag (#{name}) while required end tags are missing.")
if @tree.open_elements.last.name != name
parse_error("Got table cell end tag (#{name}) while required end tags are missing.")
remove_open_elements_until(name)
else
@tree.openElements.pop
@tree.open_elements.pop
end
@tree.clearActiveFormattingElements
@parser.phase = @parser.phases[:inRow]
else
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
parse_error(_("Unexpected end tag (#{name}). Ignored."))
end
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
parse_error(_("Unexpected end tag (#{name}). Ignored."))
end
def endTagImply(name)
@ -55,8 +55,8 @@ module HTML5lib
closeCell
@parser.phase.processEndTag(name)
else
# sometimes innerHTML case
@parser.parseError
# sometimes inner_html case
parse_error
end
end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InColumnGroupPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-column
@ -10,7 +10,7 @@ module HTML5lib
handle_end 'colgroup', 'col'
def ignoreEndTagColgroup
@tree.openElements[-1].name == 'html'
@tree.open_elements[-1].name == 'html'
end
def processCharacters(data)
@ -20,8 +20,8 @@ module HTML5lib
end
def startTagCol(name, attributes)
@tree.insertElement(name, attributes)
@tree.openElements.pop
@tree.insert_element(name, attributes)
@tree.open_elements.pop
end
def startTagOther(name, attributes)
@ -32,17 +32,17 @@ module HTML5lib
def endTagColgroup(name)
if ignoreEndTagColgroup
# innerHTML case
assert @parser.innerHTML
@parser.parseError
# inner_html case
assert @parser.inner_html
parse_error
else
@tree.openElements.pop
@tree.open_elements.pop
@parser.phase = @parser.phases[:inTable]
end
end
def endTagCol(name)
@parser.parseError(_('Unexpected end tag (col). col has no end tag.'))
parse_error(_('Unexpected end tag (col). col has no end tag.'))
end
def endTagOther(name)

View file

@ -0,0 +1,57 @@
require 'html5/html5parser/phase'
module HTML5
  # Token handlers used while the parser is in the "in frameset" phase.
  #
  # Spec: http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
  class InFramesetPhase < Phase

    handle_start(*%w[html frameset frame noframes])

    handle_end(*%w[frameset noframes])

    # Character data is reported as a parse error; nothing is inserted.
    def processCharacters(data)
      message = _('Unexpected characters in the frameset phase. Characters ignored.')
      parse_error(message)
    end

    # A <frameset> start tag is inserted and stays on the stack of open elements.
    def startTagFrameset(name, attributes)
      @tree.insert_element(name, attributes)
    end

    # A <frame> start tag is inserted and immediately popped off the stack
    # of open elements again.
    def startTagFrame(name, attributes)
      @tree.insert_element(name, attributes)
      @tree.open_elements.pop
    end

    # <noframes> start tags are handed to the :inBody phase.
    def startTagNoframes(name, attributes)
      in_body = @parser.phases[:inBody]
      in_body.processStartTag(name, attributes)
    end

    # Every other start tag is reported and otherwise left alone.
    def startTagOther(name, attributes)
      message = _("Unexpected start tag token (#{name}) in the frameset phase. Ignored")
      parse_error(message)
    end

    # Closes the current frameset, then leaves this phase once the current
    # node is no longer a "frameset" element (never when parsing inner_html).
    def endTagFrameset(name)
      current_name = @tree.open_elements.last.name
      if current_name == 'html'
        # Nothing but the root is open: the inner_html case.
        parse_error(_("Unexpected end tag token (frameset) in the frameset phase (inner_html)."))
      else
        @tree.open_elements.pop
      end

      # Stay in this phase for inner_html parsing, or while the (new) current
      # node is still a frameset; otherwise move on to :afterFrameset.
      return if @parser.inner_html
      return if @tree.open_elements.last.name == 'frameset'

      @parser.phase = @parser.phases[:afterFrameset]
    end

    # </noframes> end tags are handed to the :inBody phase.
    def endTagNoframes(name)
      in_body = @parser.phases[:inBody]
      in_body.processEndTag(name)
    end

    # Every other end tag is reported and otherwise left alone.
    def endTagOther(name)
      message = _("Unexpected end tag token (#{name}) in the frameset phase. Ignored.")
      parse_error(message)
    end

  end
end

View file

@ -0,0 +1,138 @@
require 'html5/html5parser/phase'
module HTML5
  # Token handlers used while the parser is in the "in head" phase.
  class InHeadPhase < Phase

    handle_start('html', 'head', 'title', 'style', 'script', 'noscript')
    handle_start(%w( base link meta ))

    handle_end('head')
    # NOTE(review): the hash form presumably routes these end tags to
    # endTagImplyAfterHead -- confirm against Phase.handle_end.
    handle_end(%w( html body br p ) => 'ImplyAfterHead')
    handle_end(%w( title style script noscript ))

    # End of input: an open title/style/script element is reported and
    # popped, then EOF is re-dispatched to whichever phase anything_else
    # selected.
    def process_eof
      current_name = @tree.open_elements.last.name
      if %w[title style script].include?(current_name)
        parse_error(_("Unexpected end of file. Expected end tag (#{current_name})."))
        @tree.open_elements.pop
      end
      anything_else
      @parser.phase.process_eof
    end

    # Text inside title/style/script/noscript is inserted as-is; any other
    # text ends this phase and is re-dispatched to the new current phase.
    def processCharacters(data)
      text_containers = %w[title style script noscript]
      return @tree.insertText(data) if text_containers.include?(@tree.open_elements.last.name)

      anything_else
      @parser.phase.processCharacters(data)
    end

    # A second <head> start tag is only reported.
    def startTagHead(name, attributes)
      message = _('Unexpected start tag head in existing head. Ignored')
      parse_error(message)
    end

    # <title>: created, appended via appendToHead, opened, and the tokenizer
    # is switched to the :RCDATA content model.
    def startTagTitle(name, attributes)
      title = @tree.createElement(name, attributes)
      appendToHead(title)
      @tree.open_elements.push(title)
      @parser.tokenizer.content_model_flag = :RCDATA
    end

    # <style>: created, attached, opened, tokenizer switched to :CDATA.
    def startTagStyle(name, attributes)
      node = @tree.createElement(name, attributes)
      attach_head_content(node)
      @tree.open_elements.push(node)
      @parser.tokenizer.content_model_flag = :CDATA
    end

    # <noscript>: handled exactly like <style>.
    # XXX Need to decide whether to implement the scripting disabled case.
    def startTagNoscript(name, attributes)
      node = @tree.createElement(name, attributes)
      attach_head_content(node)
      @tree.open_elements.push(node)
      @parser.tokenizer.content_model_flag = :CDATA
    end

    # <script>: like <style>, but the element is flagged "parser-inserted"
    # before it is attached.
    # XXX Inner HTML case may be wrong
    def startTagScript(name, attributes)
      script = @tree.createElement(name, attributes)
      script._flags.push("parser-inserted")
      attach_head_content(script)
      @tree.open_elements.push(script)
      @parser.tokenizer.content_model_flag = :CDATA
    end

    # <base>, <link>, <meta>: created and attached, but never pushed onto
    # the stack of open elements.
    def startTagBaseLinkMeta(name, attributes)
      attach_head_content(@tree.createElement(name, attributes))
    end

    # Any other start tag ends this phase and is re-dispatched.
    def startTagOther(name, attributes)
      anything_else
      @parser.phase.processStartTag(name, attributes)
    end

    # </head>: pops the head element when it is the current node, reports a
    # parse error otherwise; the phase becomes :afterHead in both cases.
    def endTagHead(name)
      if @tree.open_elements.last.name != 'head'
        parse_error(_("Unexpected end tag (head). Ignored."))
      else
        @tree.open_elements.pop
      end
      @parser.phase = @parser.phases[:afterHead]
    end

    # </html>, </body>, </br>, </p>: end this phase and re-dispatch the tag.
    def endTagImplyAfterHead(name)
      anything_else
      @parser.phase.processEndTag(name)
    end

    # </title>, </style>, </script>, </noscript>: popped only when the
    # matching element is the current node; reported otherwise.
    def endTagTitleStyleScriptNoscript(name)
      return @tree.open_elements.pop if @tree.open_elements.last.name == name

      parse_error(_("Unexpected end tag (#{name}). Ignored."))
    end

    # Any other end tag is only reported.
    def endTagOther(name)
      message = _("Unexpected end tag (#{name}). Ignored.")
      parse_error(message)
    end

    # Leaves the head: goes through endTagHead when <head> is the current
    # node, otherwise switches straight to the :afterHead phase.
    def anything_else
      return endTagHead('head') if @tree.open_elements.last.name == 'head'

      @parser.phase = @parser.phases[:afterHead]
    end

    protected

    # Appends +element+ to the head element when a head pointer exists;
    # without one (asserted to be the inner_html case) it goes under the
    # current node instead.
    def appendToHead(element)
      unless @tree.head_pointer.nil?
        return @tree.head_pointer.appendChild(element)
      end

      assert @parser.inner_html
      @tree.open_elements.last.appendChild(element)
    end

    # Shared by the style/noscript/script/base/link/meta handlers: uses
    # appendToHead only when a head pointer exists AND the parser's current
    # phase is :inHead; otherwise the element goes under the current node.
    def attach_head_content(element)
      if @tree.head_pointer != nil && @parser.phase == @parser.phases[:inHead]
        appendToHead(element)
      else
        @tree.open_elements.last.appendChild(element)
      end
    end

  end
end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InRowPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
@ -15,7 +15,7 @@ module HTML5lib
def startTagTableCell(name, attributes)
clearStackToTableRowContext
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inCell]
@tree.activeFormattingElements.push(Marker)
end
@ -23,7 +23,7 @@ module HTML5lib
def startTagTableOther(name, attributes)
ignoreEndTag = ignoreEndTagTr
endTagTr('tr')
# XXX how are we sure it's always ignored in the innerHTML case?
# XXX how are we sure it's always ignored in the inner_html case?
@parser.phase.processStartTag(name, attributes) unless ignoreEndTag
end
@ -33,12 +33,12 @@ module HTML5lib
def endTagTr(name)
if ignoreEndTagTr
# innerHTML case
assert @parser.innerHTML
@parser.parseError
# inner_html case
assert @parser.inner_html
parse_error
else
clearStackToTableRowContext
@tree.openElements.pop
@tree.open_elements.pop
@parser.phase = @parser.phases[:inTableBody]
end
end
@ -47,7 +47,7 @@ module HTML5lib
ignoreEndTag = ignoreEndTagTr
endTagTr('tr')
# Reprocess the current tag if the tr end tag was not ignored
# XXX how are we sure it's always ignored in the innerHTML case?
# XXX how are we sure it's always ignored in the inner_html case?
@parser.phase.processEndTag(name) unless ignoreEndTag
end
@ -56,13 +56,13 @@ module HTML5lib
endTagTr('tr')
@parser.phase.processEndTag(name)
else
# innerHTML case
@parser.parseError
# inner_html case
parse_error
end
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}) in the row phase. Ignored."))
parse_error(_("Unexpected end tag (#{name}) in the row phase. Ignored."))
end
def endTagOther(name)
@ -73,9 +73,9 @@ module HTML5lib
# XXX unify this with other table helper methods
def clearStackToTableRowContext
until ['tr', 'html'].include?(name = @tree.openElements[-1].name)
@parser.parseError(_("Unexpected implied end tag (#{name}) in the row phase."))
@tree.openElements.pop
until %w[tr html].include?(name = @tree.open_elements.last.name)
parse_error(_("Unexpected implied end tag (#{name}) in the row phase."))
@tree.open_elements.pop
end
end

View file

@ -0,0 +1,84 @@
require 'html5/html5parser/phase'
module HTML5
  # Token handlers used while the parser is in the "in select" phase.
  class InSelectPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select

    handle_start 'html', 'option', 'optgroup', 'select'

    # NOTE(review): the hash form presumably routes the table-related end
    # tags to endTagTableElements -- confirm against Phase.handle_end.
    handle_end 'option', 'optgroup', 'select', %w( caption table tbody tfoot thead tr td th ) => 'TableElements'

    # Character data is inserted as text.
    def processCharacters(data)
      @tree.insertText(data)
    end

    # <option>: closes a currently open option, then inserts the new one.
    def startTagOption(name, attributes)
      # We need to imply </option> if <option> is the current node.
      @tree.open_elements.pop if @tree.open_elements.last.name == 'option'
      @tree.insert_element(name, attributes)
    end

    # <optgroup>: closes a currently open option and then a currently open
    # optgroup before inserting the new element.
    def startTagOptgroup(name, attributes)
      @tree.open_elements.pop if @tree.open_elements.last.name == 'option'
      @tree.open_elements.pop if @tree.open_elements.last.name == 'optgroup'
      @tree.insert_element(name, attributes)
    end

    # A nested <select> start tag is reported and then handled as </select>.
    def startTagSelect(name, attributes)
      # NOTE(review): the message says "implies select start tag" although
      # the tag is processed as an end tag below; wording left as-is.
      parse_error(_('Unexpected start tag (select) in the select phase implies select start tag.'))
      endTagSelect('select')
    end

    # Any other start tag is reported and ignored.
    def startTagOther(name, attributes)
      # Double quotes are required here so that #{name} is interpolated;
      # a single-quoted literal would emit the text "#{name}" verbatim.
      parse_error(_("Unexpected start tag token (#{name}) in the select phase. Ignored."))
    end

    # </option>: pops the option when it is the current node, otherwise
    # reports a parse error.
    def endTagOption(name)
      if @tree.open_elements.last.name == 'option'
        @tree.open_elements.pop
      else
        parse_error(_('Unexpected end tag (option) in the select phase. Ignored.'))
      end
    end

    # </optgroup>: closes an option that sits directly inside an optgroup,
    # then the optgroup itself; anything else is a parse error.
    def endTagOptgroup(name)
      # </optgroup> implicitly closes <option>
      if @tree.open_elements.last.name == 'option' && @tree.open_elements[-2].name == 'optgroup'
        @tree.open_elements.pop
      end
      # It also closes </optgroup>
      if @tree.open_elements.last.name == 'optgroup'
        @tree.open_elements.pop
      # But nothing else
      else
        parse_error(_('Unexpected end tag (optgroup) in the select phase. Ignored.'))
      end
    end

    # </select>: when a select is in (table) scope, pops elements up to it
    # and lets the parser reset its insertion mode; otherwise reports a
    # parse error.
    def endTagSelect(name)
      if in_scope?('select', true)
        remove_open_elements_until('select')
        @parser.reset_insertion_mode
      else
        # inner_html case
        parse_error
      end
    end

    # Table-related end tags are always reported; when the named element is
    # in (table) scope the select is closed first and the tag is then
    # re-dispatched to the parser's new current phase.
    def endTagTableElements(name)
      parse_error(_("Unexpected table end tag (#{name}) in the select phase."))
      if in_scope?(name, true)
        endTagSelect('select')
        @parser.phase.processEndTag(name)
      end
    end

    # Any other end tag is reported and ignored.
    def endTagOther(name)
      parse_error(_("Unexpected end tag token (#{name}) in the select phase. Ignored."))
    end

  end
end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InTableBodyPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
@ -15,12 +15,12 @@ module HTML5lib
def startTagTr(name, attributes)
clearStackToTableBodyContext
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inRow]
end
def startTagTableCell(name, attributes)
@parser.parseError(_("Unexpected table cell start tag (#{name}) in the table body phase."))
parse_error(_("Unexpected table cell start tag (#{name}) in the table body phase."))
startTagTr('tr', {})
@parser.phase.processStartTag(name, attributes)
end
@ -29,11 +29,11 @@ module HTML5lib
# XXX AT Any ideas on how to share this with endTagTable?
if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true)
clearStackToTableBodyContext
endTagTableRowGroup(@tree.openElements[-1].name)
endTagTableRowGroup(@tree.open_elements.last.name)
@parser.phase.processStartTag(name, attributes)
else
# innerHTML case
@parser.parseError
# inner_html case
parse_error
end
end
@ -44,26 +44,26 @@ module HTML5lib
def endTagTableRowGroup(name)
if in_scope?(name, true)
clearStackToTableBodyContext
@tree.openElements.pop
@tree.open_elements.pop
@parser.phase = @parser.phases[:inTable]
else
@parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
parse_error(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
end
end
def endTagTable(name)
if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true)
clearStackToTableBodyContext
endTagTableRowGroup(@tree.openElements[-1].name)
endTagTableRowGroup(@tree.open_elements.last.name)
@parser.phase.processEndTag(name)
else
# innerHTML case
@parser.parseError
# inner_html case
parse_error
end
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
parse_error(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
end
def endTagOther(name)
@ -73,9 +73,9 @@ module HTML5lib
protected
def clearStackToTableBodyContext
until ['tbody', 'tfoot', 'thead', 'html'].include?(name = @tree.openElements[-1].name)
@parser.parseError(_("Unexpected implied end tag (#{name}) in the table body phase."))
@tree.openElements.pop
until %w[tbody tfoot thead html].include?(name = @tree.open_elements.last.name)
parse_error(_("Unexpected implied end tag (#{name}) in the table body phase."))
@tree.open_elements.pop
end
end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InTablePhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
@ -12,24 +12,24 @@ module HTML5lib
handle_end 'table', %w( body caption col colgroup html tbody td tfoot th thead tr ) => 'Ignore'
def processCharacters(data)
@parser.parseError(_("Unexpected non-space characters in table context caused voodoo mode."))
parse_error(_("Unexpected non-space characters in table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in
@tree.insertFromTable = true
@tree.insert_from_table = true
# Process the character in the "in body" mode
@parser.phases[:inBody].processCharacters(data)
@tree.insertFromTable = false
@tree.insert_from_table = false
end
def startTagCaption(name, attributes)
clearStackToTableContext
@tree.activeFormattingElements.push(Marker)
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inCaption]
end
def startTagColgroup(name, attributes)
clearStackToTableContext
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inColumnGroup]
end
@ -40,7 +40,7 @@ module HTML5lib
def startTagRowGroup(name, attributes)
clearStackToTableContext
@tree.insertElement(name, attributes)
@tree.insert_element(name, attributes)
@parser.phase = @parser.phases[:inTableBody]
end
@ -50,60 +50,60 @@ module HTML5lib
end
def startTagTable(name, attributes)
@parser.parseError(_("Unexpected start tag (table) in table phase. Implies end tag (table)."))
parse_error(_("Unexpected start tag (table) in table phase. Implies end tag (table)."))
@parser.phase.processEndTag('table')
@parser.phase.processStartTag(name, attributes) unless @parser.innerHTML
@parser.phase.processStartTag(name, attributes) unless @parser.inner_html
end
def startTagOther(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) in table context caused voodoo mode."))
parse_error(_("Unexpected start tag (#{name}) in table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in
@tree.insertFromTable = true
@tree.insert_from_table = true
# Process the start tag in the "in body" mode
@parser.phases[:inBody].processStartTag(name, attributes)
@tree.insertFromTable = false
@tree.insert_from_table = false
end
def endTagTable(name)
if in_scope?('table', true)
@tree.generateImpliedEndTags
unless @tree.openElements[-1].name == 'table'
@parser.parseError(_("Unexpected end tag (table). Expected end tag (#{@tree.openElements[-1].name})."))
unless @tree.open_elements.last.name == 'table'
parse_error(_("Unexpected end tag (table). Expected end tag (#{@tree.open_elements.last.name})."))
end
remove_open_elements_until('table')
@parser.resetInsertionMode
@parser.reset_insertion_mode
else
# innerHTML case
assert @parser.innerHTML
@parser.parseError
# inner_html case
assert @parser.inner_html
parse_error
end
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
parse_error(_("Unexpected end tag (#{name}). Ignored."))
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}) in table context caused voodoo mode."))
parse_error(_("Unexpected end tag (#{name}) in table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in
@tree.insertFromTable = true
@tree.insert_from_table = true
# Process the end tag in the "in body" mode
@parser.phases[:inBody].processEndTag(name)
@tree.insertFromTable = false
@tree.insert_from_table = false
end
protected
def clearStackToTableContext
# "clear the stack back to a table context"
until ['table', 'html'].include?(name = @tree.openElements[-1].name)
@parser.parseError(_("Unexpected implied end tag (#{name}) in the table phase."))
@tree.openElements.pop
until %w[table html].include?(name = @tree.open_elements.last.name)
parse_error(_("Unexpected implied end tag (#{name}) in the table phase."))
@tree.open_elements.pop
end
# When the current node is <html> it's an innerHTML case
# When the current node is <html> it's an inner_html case
end
end

View file

@ -1,28 +1,28 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InitialPhase < Phase
# This phase deals with error handling as well which is currently not
# covered in the specification. The error handling is typically known as
# "quirks mode". It is expected that a future version of HTML5 will define this.
def processEOF
@parser.parseError(_('Unexpected End of file. Expected DOCTYPE.'))
def process_eof
parse_error(_('Unexpected End of file. Expected DOCTYPE.'))
@parser.phase = @parser.phases[:rootElement]
@parser.phase.processEOF
@parser.phase.process_eof
end
def processComment(data)
@tree.insertComment(data, @tree.document)
@tree.insert_comment(data, @tree.document)
end
def processDoctype(name, publicId, systemId, correct)
if name.downcase != 'html' or publicId or systemId
@parser.parseError(_('Erroneous DOCTYPE.'))
parse_error(_('Erroneous DOCTYPE.'))
end
# XXX need to update DOCTYPE tokens
@tree.insertDoctype(name)
@tree.insertDoctype(name, publicId, systemId)
publicId = publicId.to_s.upcase
@ -110,23 +110,22 @@ module HTML5lib
end
def processSpaceCharacters(data)
@tree.insertText(data, @tree.document)
end
def processCharacters(data)
@parser.parseError(_('Unexpected non-space characters. Expected DOCTYPE.'))
parse_error(_('Unexpected non-space characters. Expected DOCTYPE.'))
@parser.phase = @parser.phases[:rootElement]
@parser.phase.processCharacters(data)
end
def processStartTag(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}). Expected DOCTYPE."))
parse_error(_("Unexpected start tag (#{name}). Expected DOCTYPE."))
@parser.phase = @parser.phases[:rootElement]
@parser.phase.processStartTag(name, attributes)
end
def processEndTag(name)
@parser.parseError(_("Unexpected end tag (#{name}). Expected DOCTYPE."))
parse_error(_("Unexpected end tag (#{name}). Expected DOCTYPE."))
@parser.phase = @parser.phases[:rootElement]
@parser.phase.processEndTag(name)
end

View file

@ -1,4 +1,4 @@
module HTML5lib
module HTML5
# Base class for helper objects that implement each phase of processing.
#
# Handler methods should be in the following order (they can be omitted):
@ -15,9 +15,12 @@ module HTML5lib
#
class Phase
extend Forwardable
def_delegators :@parser, :parse_error
# The following example call:
#
# tag_handlers('startTag', 'html', %( base link meta ), %( li dt dd ) => 'ListItem')
# tag_handlers('startTag', 'html', %w( base link meta ), %w( li dt dd ) => 'ListItem')
#
# ...would return a hash equal to this:
#
@ -34,15 +37,15 @@ module HTML5lib
if tags.last.is_a?(Hash)
tags.pop.each do |names, handler_method_suffix|
handler_method = prefix + handler_method_suffix
Array(names).each { |name| mapping[name] = handler_method }
Array(names).each {|name| mapping[name] = handler_method }
end
end
tags.each do |names|
names = Array(names)
handler_method = prefix + names.map { |name| name.capitalize }.join
names.each { |name| mapping[name] = handler_method }
handler_method = prefix + names.map {|name| name.capitalize }.join
names.each {|name| mapping[name] = handler_method }
end
return mapping
mapping
end
def self.start_tag_handlers
@ -80,17 +83,17 @@ module HTML5lib
@parser, @tree = parser, tree
end
def processEOF
def process_eof
@tree.generateImpliedEndTags
if @tree.openElements.length > 2
@parser.parseError(_('Unexpected end of file. Missing closing tags.'))
elsif @tree.openElements.length == 2 and @tree.openElements[1].name != 'body'
if @tree.open_elements.length > 2
parse_error(_('Unexpected end of file. Missing closing tags.'))
elsif @tree.open_elements.length == 2 and @tree.open_elements[1].name != 'body'
# This happens for framesets or something?
@parser.parseError(_("Unexpected end of file. Expected end tag (#{@tree.openElements[1].name}) first."))
elsif @parser.innerHTML and @tree.openElements.length > 1
parse_error(_("Unexpected end of file. Expected end tag (#{@tree.open_elements[1].name}) first."))
elsif @parser.inner_html and @tree.open_elements.length > 1
# XXX This is not what the specification says. Not sure what to do here.
@parser.parseError(_('XXX innerHTML EOF'))
parse_error(_('XXX inner_html EOF'))
end
# Betting ends.
end
@ -98,11 +101,11 @@ module HTML5lib
def processComment(data)
# For most phases the following is correct. Where it's not it will be
# overridden.
@tree.insertComment(data, @tree.openElements[-1])
@tree.insert_comment(data, @tree.open_elements.last)
end
def processDoctype(name, publicId, systemId, correct)
@parser.parseError(_('Unexpected DOCTYPE. Ignored.'))
parse_error(_('Unexpected DOCTYPE. Ignored.'))
end
def processSpaceCharacters(data)
@ -114,17 +117,17 @@ module HTML5lib
end
def startTagHtml(name, attributes)
if @parser.firstStartTag == false and name == 'html'
@parser.parseError(_('html needs to be the first start tag.'))
if @parser.first_start_tag == false and name == 'html'
parse_error(_('html needs to be the first start tag.'))
end
# XXX Need a check here to see if the first start tag token emitted is
# this token... If it's not, invoke @parser.parseError.
# this token... If it's not, invoke parse_error.
attributes.each do |attr, value|
unless @tree.openElements[0].attributes.has_key?(attr)
@tree.openElements[0].attributes[attr] = value
unless @tree.open_elements.first.attributes.has_key?(attr)
@tree.open_elements.first.attributes[attr] = value
end
end
@parser.firstStartTag = false
@parser.first_start_tag = false
end
def processEndTag(name)
@ -146,11 +149,10 @@ module HTML5lib
def remove_open_elements_until(name=nil)
finished = false
until finished
element = @tree.openElements.pop
finished = name.nil?? yield(element) : element.name == name
element = @tree.open_elements.pop
finished = name.nil? ? yield(element) : element.name == name
end
return element
end
end
end

View file

@ -1,40 +1,39 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class RootElementPhase < Phase
def processEOF
insertHtmlElement
@parser.phase.processEOF
def process_eof
insert_html_element
@parser.phase.process_eof
end
def processComment(data)
@tree.insertComment(data, @tree.document)
@tree.insert_comment(data, @tree.document)
end
def processSpaceCharacters(data)
@tree.insertText(data, @tree.document)
end
def processCharacters(data)
insertHtmlElement
insert_html_element
@parser.phase.processCharacters(data)
end
def processStartTag(name, attributes)
@parser.firstStartTag = true if name == 'html'
insertHtmlElement
@parser.first_start_tag = true if name == 'html'
insert_html_element
@parser.phase.processStartTag(name, attributes)
end
def processEndTag(name)
insertHtmlElement
insert_html_element
@parser.phase.processEndTag(name)
end
def insertHtmlElement
def insert_html_element
element = @tree.createElement('html', {})
@tree.openElements.push(element)
@tree.open_elements.push(element)
@tree.document.appendChild(element)
@parser.phase = @parser.phases[:beforeHead]
end

View file

@ -0,0 +1,35 @@
require 'html5/html5parser/phase'
module HTML5
# Phase used once the parser expects nothing but the end of the file.
#
# Comments are attached to the document and whitespace is forwarded to the
# previously active phase (@parser.last_phase). Any other token is reported
# as a parse error, after which the parser switches back to that previous
# phase and lets it process the token.
class TrailingEndPhase < Phase

  # Reaching end of file here is the expected outcome; nothing to do.
  def process_eof
  end

  # Comments seen in this phase become children of the document itself.
  def processComment(data)
    @tree.insert_comment(data, @tree.document)
  end

  # Whitespace is handled by the previous phase without leaving this phase.
  def processSpaceCharacters(data)
    @parser.last_phase.processSpaceCharacters(data)
  end

  def processCharacters(data)
    parse_error(_('Unexpected non-space characters. Expected end of file.'))
    @parser.phase = @parser.last_phase
    @parser.phase.processCharacters(data)
  end

  def processStartTag(name, attributes)
    # Double quotes are required so that the tag name is interpolated into
    # the message (single quotes would emit the literal text "#{name}").
    parse_error(_("Unexpected start tag (#{name}). Expected end of file."))
    @parser.phase = @parser.last_phase
    @parser.phase.processStartTag(name, attributes)
  end

  def processEndTag(name)
    # Double quotes are required here as well, for the same reason.
    parse_error(_("Unexpected end tag (#{name}). Expected end of file."))
    @parser.phase = @parser.last_phase
    @parser.phase.processEndTag(name)
  end
end
end

View file

@ -1,7 +1,7 @@
require 'stringio'
require 'html5lib/constants'
require 'html5/constants'
module HTML5lib
module HTML5
# Provides a unicode stream of characters to the HTMLTokenizer.
@ -10,7 +10,7 @@ module HTML5lib
class HTMLInputStream
attr_accessor :queue, :char_encoding
attr_accessor :queue, :char_encoding, :errors
# Initialises the HTMLInputStream.
#
@ -27,11 +27,11 @@ module HTML5lib
# parseMeta - Look for a <meta> element containing encoding information
def initialize(source, options = {})
@encoding = nil
@encoding = nil
@parse_meta = true
@chardet = true
@chardet = true
options.each { |name, value| instance_variable_set("@#{name}", value) }
options.each {|name, value| instance_variable_set("@#{name}", value) }
# Raw Stream
@raw_stream = open_stream(source)
@ -40,25 +40,31 @@ module HTML5lib
#Number of bytes to use when looking for a meta element with
#encoding information
@NUM_BYTES_META = 512
#Number of bytes to use when using detecting encoding using chardet
@NUM_BYTES_CHARDET = 256
#Number of bytes to use when reading content
@NUM_BYTES_BUFFER = 1024
#Encoding to use if no other information can be found
@DEFAULT_ENCODING = 'windows-1252'
#Detect encoding iff no explicit "transport level" encoding is supplied
if @encoding.nil? or not HTML5lib.is_valid_encoding(@encoding)
if @encoding.nil? or not HTML5.is_valid_encoding(@encoding)
@char_encoding = detect_encoding
else
@char_encoding = @encoding
end
# Read bytes from stream decoding them into Unicode
uString = @raw_stream.read
@buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
if @char_encoding == 'windows-1252'
@win1252 = true
elsif @char_encoding != 'utf-8'
begin
require 'iconv'
begin
uString = Iconv.iconv('utf-8', @char_encoding, uString).first
@buffer << @raw_stream.read unless @raw_stream.eof?
@buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
rescue
@win1252 = true
end
@ -67,10 +73,8 @@ module HTML5lib
end
end
# Convert the unicode string into a list to be used as the data stream
@data_stream = uString
@queue = []
@errors = []
# Reset position in the list to read from
@tell = 0
@ -109,9 +113,22 @@ module HTML5lib
begin
require 'rubygems'
require 'UniversalDetector' # gem install chardet
buffer = @raw_stream.read
encoding = UniversalDetector::chardet(buffer)['encoding']
seek(buffer, 0)
buffers = []
detector = UniversalDetector::Detector.instance
detector.reset
until @raw_stream.eof?
buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
break if !buffer or buffer.empty?
buffers << buffer
detector.feed(buffer)
break if detector.instance_eval {@done}
detector.instance_eval {
@_mLastChar = @_mLastChar.chr if Fixnum === @_mLastChar
}
end
detector.close
encoding = detector.result['encoding']
seek(buffers*'', 0)
rescue LoadError
end
end
@ -242,14 +259,20 @@ module HTML5lib
unless @queue.empty?
return @queue.shift
else
c = @data_stream[@tell]
if @tell + 3 > @buffer.length and !@raw_stream.eof?
# read next block
@buffer = @buffer[@tell .. -1] + @raw_stream.read(@NUM_BYTES_BUFFER)
@tell = 0
end
c = @buffer[@tell]
@tell += 1
case c
when 0x01 .. 0x7F
if c == 0x0D
# normalize newlines
@tell += 1 if @data_stream[@tell] == 0x0A
@tell += 1 if @buffer[@tell] == 0x0A
c = 0x0A
end
@ -274,9 +297,9 @@ module HTML5lib
end
when 0xC0 .. 0xFF
if @win1252
if instance_variables.include?("@win1252") && @win1252
"\xC3" + (c-64).chr # convert to utf-8
elsif @data_stream[@tell-1 .. -1] =~ /^
elsif @buffer[@tell-1 .. @tell+3] =~ /^
( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
@ -292,6 +315,8 @@ module HTML5lib
end
when 0x00
@errors.push('null character found in input stream, ' +
'replaced with U+FFFD')
[0xFFFD].pack('U') # null characters are invalid
else
@ -317,6 +342,10 @@ module HTML5lib
@queue.insert(0, c) unless c == :EOF
return char_stack.join('')
end
def unget(characters)
@queue.unshift(*characters.to_a) unless characters == :EOF
end
end
# String-like object with an assosiated position and various extra methods
@ -433,14 +462,14 @@ module HTML5lib
if attr[0] == 'charset'
tentative_encoding = attr[1]
if HTML5lib.is_valid_encoding(tentative_encoding)
if HTML5.is_valid_encoding(tentative_encoding)
@encoding = tentative_encoding
return false
end
elsif attr[0] == 'content'
content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
tentative_encoding = content_parser.parse
if HTML5lib.is_valid_encoding(tentative_encoding)
if HTML5.is_valid_encoding(tentative_encoding)
@encoding = tentative_encoding
return false
end

View file

@ -11,10 +11,10 @@
#
# @@TODO:
# * Selectively lowercase only XHTML, but not foreign markup
require 'html5lib/html5parser'
require 'html5lib/constants'
require 'html5/html5parser'
require 'html5/constants'
module HTML5lib
module HTML5
# liberal XML parser
class XMLParser < HTMLParser
@ -24,26 +24,36 @@ module HTML5lib
@phases[:initial] = XmlRootPhase.new(self, @tree)
end
def normalizeToken(token)
if token[:type] == :StartTag or token[:type] == :EmptyTag
def normalize_token(token)
case token[:type]
when :StartTag, :EmptyTag
# We need to remove the duplicate attributes and convert attributes
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
# to a Hash so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
token[:data] = Hash[*token[:data].reverse.flatten]
# For EmptyTags, process both a Start and an End tag
if token[:type] == :EmptyTag
save = @tokenizer.content_model_flag
@phase.processStartTag(token[:name], token[:data])
@tokenizer.content_model_flag = save
token[:data] = {}
token[:type] = :EndTag
end
elsif token[:type] == :EndTag
if token[:data]
parseError(_("End tag contains unexpected attributes."))
when :Characters
# un-escape RCDATA_ELEMENTS (e.g. style, script)
if @tokenizer.content_model_flag == :CDATA
token[:data] = token[:data].
gsub('&lt;','<').gsub('&gt;','>').gsub('&amp;','&')
end
elsif token[:type] == :Comment
when :EndTag
if token[:data]
parse_error(_("End tag contains unexpected attributes."))
end
when :Comment
# Rescue CDATA from the comments
if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
token[:type] = :Characters
@ -64,22 +74,22 @@ module HTML5lib
@phases[:rootElement] = XhmlRootPhase.new(self, @tree)
end
def normalizeToken(token)
def normalize_token(token)
super(token)
# ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted
if token[:type] == :EndTag
if VOID_ELEMENTS.include? token[:name]
if @tree.openElements[-1].name != token["name"]:
if @tree.open_elements[-1].name != token["name"]:
token[:type] = :EmptyTag
token["data"] ||= {}
end
else
if token[:name] == @tree.openElements[-1].name and \
not @tree.openElements[-1].hasContent
if token[:name] == @tree.open_elements[-1].name and \
not @tree.open_elements[-1].hasContent
@tree.insertText('') unless
@tree.openElements.any? {|e|
@tree.open_elements.any? {|e|
e.attributes.keys.include? 'xmlns' and
e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
}
@ -92,9 +102,9 @@ module HTML5lib
end
class XhmlRootPhase < RootElementPhase
def insertHtmlElement
def insert_html_element
element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
@tree.openElements.push(element)
@tree.open_elements.push(element)
@tree.document.appendChild(element)
@parser.phase = @parser.phases[:beforeHead]
end
@ -105,15 +115,15 @@ module HTML5lib
@start_tag_handlers = Hash.new(:startTagOther)
@end_tag_handlers = Hash.new(:endTagOther)
def startTagOther(name, attributes)
@tree.openElements.push(@tree.document)
@tree.open_elements.push(@tree.document)
element = @tree.createElement(name, attributes)
@tree.openElements[-1].appendChild(element)
@tree.openElements.push(element)
@tree.open_elements[-1].appendChild(element)
@tree.open_elements.push(element)
@parser.phase = XmlElementPhase.new(@parser,@tree)
end
def endTagOther(name)
super
@tree.openElements.pop
@tree.open_elements.pop
end
end
@ -125,17 +135,17 @@ module HTML5lib
def startTagOther(name, attributes)
element = @tree.createElement(name, attributes)
@tree.openElements[-1].appendChild(element)
@tree.openElements.push(element)
@tree.open_elements[-1].appendChild(element)
@tree.open_elements.push(element)
end
def endTagOther(name)
for node in @tree.openElements.reverse
for node in @tree.open_elements.reverse
if node.name == name
{} while @tree.openElements.pop != node
{} while @tree.open_elements.pop != node
break
else
@parser.parseError
parse_error
end
end
end

View file

@ -1,6 +1,7 @@
require 'cgi'
require 'html5/tokenizer'
module HTML5lib
module HTML5
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes.
@ -12,11 +13,11 @@ module HTML5lib
# or, if you already have a parse tree (in this example, a REXML tree),
# at the Serializer stage:
#
# tokens = TreeWalkers.getTreeWalker('rexml').new(tree)
# tokens = TreeWalkers.get_tree_walker('rexml').new(tree)
# HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
# :sanitize => true})
module HTMLSanitizeModule
module HTMLSanitizeModule
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
button caption center cite code col colgroup dd del dfn dir div dl dt

View file

@ -0,0 +1,2 @@
require 'html5/serializer/htmlserializer'
require 'html5/serializer/xhtmlserializer'

View file

@ -1,6 +1,6 @@
require 'html5lib/constants'
require 'html5/constants'
module HTML5lib
module HTML5
class HTMLSerializer
@ -13,17 +13,18 @@ module HTML5lib
end
def initialize(options={})
@quote_attr_values = false
@quote_char = '"'
@use_best_quote_char = true
@quote_attr_values = false
@quote_char = '"'
@use_best_quote_char = true
@minimize_boolean_attributes = true
@use_trailing_solidus = false
@use_trailing_solidus = false
@space_before_trailing_solidus = true
@escape_lt_in_attrs = false
@escape_lt_in_attrs = false
@escape_rcdata = false
@omit_optional_tags = true
@sanitize = false
@sanitize = false
@strip_whitespace = false
@ -43,22 +44,22 @@ module HTML5lib
@errors = []
if encoding and @inject_meta_charset
require 'html5lib/filters/inject_meta_charset'
require 'html5/filters/inject_meta_charset'
treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
end
if @strip_whitespace
require 'html5lib/filters/whitespace'
require 'html5/filters/whitespace'
treewalker = Filters::WhitespaceFilter.new(treewalker)
end
if @sanitize
require 'html5lib/filters/sanitizer'
require 'html5/filters/sanitizer'
treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
end
if @omit_optional_tags
require 'html5lib/filters/optionaltags'
require 'html5/filters/optionaltags'
treewalker = Filters::OptionalTagFilter.new(treewalker)
end
@ -72,7 +73,7 @@ module HTML5lib
elsif [:Characters, :SpaceCharacters].include? type
if type == :SpaceCharacters or in_cdata
if in_cdata and token[:data].include?("</")
serializeError(_("Unexpected </ in CDATA"))
serialize_error(_("Unexpected </ in CDATA"))
end
result << token[:data]
else
@ -81,10 +82,10 @@ module HTML5lib
elsif [:StartTag, :EmptyTag].include? type
name = token[:name]
if RCDATA_ELEMENTS.include?(name)
if RCDATA_ELEMENTS.include?(name) and not @escape_rcdata
in_cdata = true
elsif in_cdata
serializeError(_("Unexpected child element of a CDATA element"))
serialize_error(_("Unexpected child element of a CDATA element"))
end
attributes = []
for k,v in attrs = token[:data].to_a.sort
@ -136,19 +137,19 @@ module HTML5lib
if RCDATA_ELEMENTS.include?(name)
in_cdata = false
elsif in_cdata
serializeError(_("Unexpected child element of a CDATA element"))
serialize_error(_("Unexpected child element of a CDATA element"))
end
end_tag = "</#{name}>"
result << end_tag
elsif type == :Comment
data = token[:data]
serializeError(_("Comment contains --")) if data.index("--")
serialize_error(_("Comment contains --")) if data.index("--")
comment = "<!--%s-->" % token[:data]
result << comment
else
serializeError(token[:data])
serialize_error(token[:data])
end
end
@ -162,13 +163,15 @@ module HTML5lib
alias :render :serialize
def serializeError(data="XXX ERROR MESSAGE NEEDED")
def serialize_error(data="XXX ERROR MESSAGE NEEDED")
# XXX The idea is to make data mandatory.
@errors.push(data)
if @strict
raise SerializeError
end
end
def _(string); string; end
end
# Error in serialized tree

View file

@ -0,0 +1,20 @@
require 'html5/serializer/htmlserializer'
module HTML5
# HTMLSerializer preconfigured with a set of XHTML-oriented option values.
# Options passed by the caller take precedence over the values in DEFAULTS.
class XHTMLSerializer < HTMLSerializer
  DEFAULTS = {
    :quote_attr_values           => true,
    :minimize_boolean_attributes => false,
    :use_trailing_solidus        => true,
    :escape_lt_in_attrs          => true,
    :omit_optional_tags          => false,
    :escape_rcdata               => true
  }

  # options - Hash of serializer options; entries override DEFAULTS.
  def initialize(options={})
    # merge builds a new Hash, so DEFAULTS itself is never modified.
    settings = DEFAULTS.merge(options)
    super(settings)
  end
end
end

View file

@ -0,0 +1,45 @@
module HTML5
module Sniffer
  # Byte sequences looked for directly after a "<".
  SNIFF_COMMENT_OPEN  = [0x21, 0x2D, 0x2D].freeze                         # "!--"
  SNIFF_COMMENT_CLOSE = [0x2D, 0x2D, 0x3E].freeze                         # "-->"
  SNIFF_PI_CLOSE      = [0x3F, 0x3E].freeze                               # "?>"
  SNIFF_RSS           = [0x72, 0x73, 0x73].freeze                         # "rss"
  SNIFF_FEED          = [0x66, 0x65, 0x65, 0x64].freeze                   # "feed"
  SNIFF_RDF           = [0x72, 0x64, 0x66, 0x3A, 0x52, 0x44, 0x46].freeze # "rdf:RDF"

  # 4.7.4
  #
  # Decides whether a document is HTML or a feed by scanning the start of
  # +str+. Leading whitespace, comments, "<!...>" declarations and "<?...?>"
  # processing instructions are skipped; the first other construct decides.
  #
  # str - String holding the beginning of the document.
  #
  # Returns "application/rss+xml" when the first element name starts with
  # "rss", "application/atom+xml" when it starts with "feed", and
  # "text/html" otherwise (including for empty input).
  # Raises NotImplementedError when the first element is "rdf:RDF".
  def html_or_feed(str)
    # Work on integer byte values. Indexing a String yields a Fixnum only on
    # Ruby 1.8; on later versions it yields a one-character String, which
    # never equals the integer constants used below. The second slice caps
    # the scan at 512 bytes when the first slice counted characters.
    bytes = str[0, 512].unpack('C*')[0, 512] # steps 1, 2
    pos = 0
    while pos < bytes.length
      case bytes[pos]
      when 0x09, 0x20, 0x0A, 0x0D # tab, space, LF, CR
        pos += 1
      when 0x3C # "<"
        pos += 1
        if bytes[pos, 3] == SNIFF_COMMENT_OPEN
          # Skip the whole comment, including its closing "-->".
          pos += 3
          pos += 1 until bytes[pos, 3] == SNIFF_COMMENT_CLOSE || pos >= bytes.length
          pos += 3
        elsif bytes[pos] == 0x21 # "!"
          # Skip a declaration such as a DOCTYPE up to and including ">".
          pos += 1
          pos += 1 until bytes[pos] == 0x3E || pos >= bytes.length # ">"
          pos += 1
        elsif bytes[pos] == 0x3F # "?"
          # Skip a processing instruction up to and including "?>".
          pos += 1 until bytes[pos, 2] == SNIFF_PI_CLOSE || pos >= bytes.length
          pos += 2
        elsif bytes[pos, 3] == SNIFF_RSS
          return "application/rss+xml"
        elsif bytes[pos, 4] == SNIFF_FEED
          return "application/atom+xml"
        elsif bytes[pos, 7] == SNIFF_RDF
          raise NotImplementedError
        end
        # Any other tag falls through; its first name byte ends the scan on
        # the next iteration via the else branch below.
      else
        break
      end
    end
    "text/html"
  end
end
end

View file

@ -0,0 +1,968 @@
require 'html5/constants'
require 'html5/inputstream'
module HTML5
# This class takes care of tokenizing HTML.
#
# * @current_token
# Holds the token that is currently being processed.
#
# * @state
# Holds a reference to the method to be invoked... XXX
#
# * @states
# Holds a mapping between states and methods that implement the state.
#
# * @stream
# Points to HTMLInputStream object.
class HTMLTokenizer
attr_accessor :content_model_flag, :current_token
attr_reader :stream
# XXX need to fix documentation
# Builds a tokenizer reading from +stream+.
#
# stream  - source handed to HTMLInputStream.new.
# options - Hash; passed on unchanged to HTMLInputStream.new. The keys
#           :lowercase_element_name and :lowercase_attr_name are also read
#           here and count as enabled unless they are exactly false.
def initialize(stream, options = {})
  @stream = HTMLInputStream.new(stream, options)

  # Initial tokenizer state.
  @state = :data_state
  @content_model_flag = :PCDATA
  @escapeFlag = false
  @lastFourChars = []

  # Token under construction and tokens waiting to be yielded by #each.
  @current_token = nil
  @token_queue = []

  # Only an explicit false switches lowercasing off; nil/missing keeps it on.
  @lowercase_element_name = !(options[:lowercase_element_name] == false)
  @lowercase_attr_name = !(options[:lowercase_attr_name] == false)
end
# This is where the magic happens.
#
# We do our usual processing through the states and when we have a token
# to return we yield the token which pauses processing until the next token
# is requested.
# Runs the state machine and yields every token it produces.
#
# Each pass invokes the method named by @state. After a pass, errors
# collected by the input stream are yielded first as :ParseError tokens,
# followed by the tokens queued by the state method. The loop stops as soon
# as a state method returns a false value (done by the data state at EOF).
def each
  @token_queue = []
  while send(@state)
    until @stream.errors.empty?
      yield(:type => :ParseError, :data => @stream.errors.shift)
    end
    until @token_queue.empty?
      yield(@token_queue.shift)
    end
  end
end
# Below are various helper functions the tokenizer states use worked out.
# If the next character is a '>', convert the current_token into
# an EmptyTag
# Handles a "/" seen inside a tag. If the current token is a start tag and
# the next character is ">", the token becomes an :EmptyTag; in every other
# case a ParseError token is queued. The character read for the check is
# always pushed back onto the stream.
def process_solidus_in_tag
  # Look one character ahead to see whether the tag closes right here.
  next_char = @stream.char
  self_closing = (@current_token[:type] == :StartTag) && (next_char == ">")

  if self_closing
    @current_token[:type] = :EmptyTag
  else
    @token_queue << {:type => :ParseError, :data => _("Solidus (/) incorrectly placed in tag.")}
  end

  # Return the look-ahead character so the calling state still sees it.
  @stream.unget(next_char)
end
# This function returns either U+FFFD or the character based on the
# decimal or hexadecimal representation. It also discards ";" if present.
# If it is not present, a ParseError token is appended to @token_queue.
# Reads the digits of a numeric character reference from @stream and
# returns the character it denotes.
#
# isHex - truthy to read hexadecimal digits, otherwise decimal digits.
#
# Returns a UTF-8 encoded String. A value of 13 is turned into 10, values
# 128..159 are mapped through ENTITIES_WINDOWS1252, and anything that is
# not a usable code point yields U+FFFD. Each of these substitutions, as
# well as a missing trailing ";", queues a ParseError token.
def consume_number_entity(isHex)
  allowed, radix = isHex ? [HEX_DIGITS, 16] : [DIGITS, 10]

  # Collect every character in range, stopping at the first one that is not
  # (which includes :EOF).
  digits = []
  c = @stream.char
  while allowed.include?(c) and c != :EOF
    digits.push(c)
    c = @stream.char
  end

  code_point = digits.join('').to_i(radix)

  if code_point == 13
    @token_queue << {:type => :ParseError, :data => _("Incorrect CR newline entity. Replaced with LF.")}
    code_point = 10
  elsif (128..159).include? code_point
    # Values 128..159 get the "windows trick": they are looked up in the
    # windows-1252 replacement table instead of being used directly.
    @token_queue << {:type => :ParseError, :data => _("Entity used with illegal number (windows-1252 reference).")}
    code_point = ENTITIES_WINDOWS1252[code_point - 128]
  end

  # Accept 1..0x10FFFF except the surrogate range 0xD800..0xDFFF.
  in_surrogate_range = (55296 <= code_point and code_point <= 57343)
  if 0 < code_point and code_point <= 1114111 and not in_surrogate_range
    char = [code_point].pack('U')
  else
    char = [0xFFFD].pack('U')
    @token_queue << {:type => :ParseError, :data => _("Numeric entity represents an illegal codepoint.")}
  end

  # A terminating ";" is simply dropped; any other character is reported
  # and handed back to the stream.
  unless c == ";"
    @token_queue << {:type => :ParseError, :data => _("Numeric entity didn't end with ';'.")}
    @stream.unget(c)
  end

  char
end
# Tries to consume a character reference from @stream and returns its
# replacement text. (When reached from the data state, the "&" itself has
# already been read.)
#
# from_attribute - when true, a named entity that is followed directly by
#                  an ASCII letter or digit is not expanded: all consumed
#                  characters are pushed back and "&" is returned instead.
#
# Returns the replacement String, or nil when no entity was recognised; in
# that case every consumed character has been handed back to the stream.
# ParseError tokens are appended to @token_queue along the way.
def consume_entity(from_attribute=false)
char = nil
char_stack = [@stream.char]
if SPACE_CHARACTERS.include?(char_stack[0]) or [:EOF, '<', '&'].include?(char_stack[0])
# Whitespace, EOF, "<" or "&" cannot start an entity: push it back and
# fall through to return nil.
@stream.unget(char_stack)
elsif char_stack[0] == '#'
# We might have a number entity here.
char_stack += [@stream.char, @stream.char]
# NOTE(review): char_stack now holds three items ("#" plus two more), but
# only the first two are checked for :EOF here - confirm that an :EOF in
# the third slot is handled as intended by the branches below.
if char_stack[0 .. 1].include? :EOF
# If we reach the end of the file put everything up to :EOF
# back in the queue
char_stack = char_stack[0...char_stack.index(:EOF)]
@stream.unget(char_stack)
@token_queue << {:type => :ParseError, :data => _("Numeric entity expected. Got end of file instead.")}
else
if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
# Hexadecimal entity detected.
# Only the first hex digit is pushed back; "#" and "x" stay consumed.
@stream.unget(char_stack[2])
char = consume_number_entity(true)
elsif DIGITS.include? char_stack[1]
# Decimal entity detected.
# Push back everything after the "#".
@stream.unget(char_stack[1..-1])
char = consume_number_entity(false)
else
# No number entity detected.
@stream.unget(char_stack)
@token_queue << {:type => :ParseError, :data => _("Numeric entity expected but none found.")}
end
end
else
# At this point in the process might have named entity. The known
# entity names are the keys of the ENTITIES constant.
#
# Consume characters and compare to these to a substring of the
# entity names in the list until the substring no longer matches.
filteredEntityList = ENTITIES.keys
filteredEntityList.reject! {|e| e[0].chr != char_stack[0]}
entityName = nil
# Try to find the longest entity the string will match to take care
# of &noti for instance.
while char_stack.last != :EOF
name = char_stack.join('')
if filteredEntityList.any? {|e| e[0...name.length] == name}
filteredEntityList.reject! {|e| e[0...name.length] != name}
char_stack.push(@stream.char)
else
break
end
if ENTITIES.include? name
entityName = name
# NOTE(review): this compares against the String ';' while the check
# further down uses the character literal ?; - on Ruby 1.8 String#[]
# returns a Fixnum, so this early exit would never trigger there.
break if entityName[-1] == ';'
end
end
if entityName != nil
char = ENTITIES[entityName]
# Check whether or not the last character returned can be
# discarded or needs to be put back.
if entityName[-1] != ?;
@token_queue << {:type => :ParseError, :data => _("Named entity didn't end with ';'.")}
end
if char_stack[-1] != ";" and from_attribute and
(ASCII_LETTERS.include?(char_stack[entityName.length]) or
DIGITS.include?(char_stack[entityName.length]))
# In an attribute value, an entity name running straight into a letter
# or digit is left unexpanded: restore the input and yield a bare "&".
@stream.unget(char_stack)
char = '&'
else
# Push back only what was read beyond the matched entity name.
@stream.unget(char_stack[entityName.length..-1])
end
else
@token_queue << {:type => :ParseError, :data => _("Named entity expected. Got none.")}
@stream.unget(char_stack)
end
end
return char
end
# This method replaces the need for "entityInAttributeValueState".
# Appends the result of consuming an entity (in attribute mode) to the
# value of the most recently added attribute of the current token. When no
# entity could be consumed, a literal "&" is appended instead.
def process_entity_in_attribute
  # Consume first, then touch the token, so the stream is read before the
  # attribute value is looked up.
  replacement = consume_entity(true) || "&"
  last_attribute = @current_token[:data][-1]
  last_attribute[1] += replacement
end
# This method is a generic handler for emitting the tags. It also sets
# the state to "data" because that's what's needed after a token has been
# emitted.
# Queues @current_token for emission and switches back to the data state.
# The tag name is lowercased first when @lowercase_element_name is set.
#
# NOTE(review): as written, a token whose type is not :StartTag, :EndTag or
# :EmptyTag is neither queued nor does it change @state - confirm that this
# method is only ever meant to be called for tag tokens.
def emit_current_token
  token = @current_token
  return unless [:StartTag, :EndTag, :EmptyTag].include?(token[:type])

  token[:name] = token[:name].downcase if @lowercase_element_name

  # Add token to the queue to be yielded
  @token_queue << token
  @state = :data_state
end
# Below are the various tokenizer states worked out.
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
# documents to figure out what the order of the various if and elsif
# statements should be.
# The "data" state: reads one character from @stream and either switches
# to another state or queues a character token.
#
# Returns false when :EOF is read (which ends the loop in #each) and true
# in every other case.
def data_state
data = @stream.char
# In CDATA/RCDATA content keep a sliding window of the last four characters
# read, used below to recognise "<!--" and "-->".
if @content_model_flag == :CDATA or @content_model_flag == :RCDATA
@lastFourChars << data
@lastFourChars.shift if @lastFourChars.length > 4
end
if data == "&" and [:PCDATA,:RCDATA].include?(@content_model_flag) and !@escapeFlag
# Possible entity; only considered in PCDATA/RCDATA outside an escape.
@state = :entity_data_state
elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
# The window now reads "<!--": raise the escape flag and emit the "-".
@escapeFlag = true
@token_queue << {:type => :Characters, :data => data}
elsif data == "<" and !@escapeFlag and
[:PCDATA,:CDATA,:RCDATA].include?(@content_model_flag)
@state = :tag_open_state
elsif data == ">" and @escapeFlag and
[:CDATA,:RCDATA].include?(@content_model_flag) and
@lastFourChars[1..-1].join('') == "-->"
# The last three characters read are "-->": clear the escape flag and
# emit the ">".
@escapeFlag = false
@token_queue << {:type => :Characters, :data => data}
elsif data == :EOF
# Tokenization ends.
return false
elsif SPACE_CHARACTERS.include? data
# Directly after emitting a token you switch back to the "data
# state". At that point SPACE_CHARACTERS are important so they are
# emitted separately.
# XXX need to check if we don't need a special "spaces" flag on
# characters.
# NOTE(review): presumably the `true` argument makes chars_until consume
# the following run of space characters - confirm against HTMLInputStream.
@token_queue << {:type => :SpaceCharacters, :data => data + @stream.chars_until(SPACE_CHARACTERS, true)}
else
# Ordinary text: emit this character together with whatever chars_until
# returns for the delimiter set & < > -.
@token_queue << {:type => :Characters, :data => data + @stream.chars_until(%w[& < > -])}
end
return true
end
# Entity data state: queues the consumed entity as a character token, or a
# literal "&" when no entity could be consumed, then goes back to the data
# state. Always returns true.
def entity_data_state
  text = consume_entity
  text = "&" unless text
  @token_queue.push({:type => :Characters, :data => text})
  @state = :data_state
  true
end
# Tag open state: entered after a "<" has been consumed. Decides, from the
# next character and the content model flag, whether a tag, a markup
# declaration or plain text follows. Always returns true.
def tag_open_state
  ch = @stream.char

  unless @content_model_flag == :PCDATA
    # We know the content model flag is set to either RCDATA or CDATA
    # now because this state can never be entered with the PLAINTEXT
    # flag.
    if ch == "/"
      @state = :close_tag_open_state
    else
      @token_queue.push({:type => :Characters, :data => "<"})
      @stream.unget(ch)
      @state = :data_state
    end
    return true
  end

  if ch == "!"
    @state = :markup_declaration_open_state
  elsif ch == "/"
    @state = :close_tag_open_state
  elsif ch != :EOF && ASCII_LETTERS.include?(ch)
    @current_token = {:type => :StartTag, :name => ch, :data => []}
    @state = :tag_name_state
  elsif ch == ">"
    # XXX In theory it could be something besides a tag name. But
    # do we really care?
    @token_queue.push({:type => :ParseError, :data => _("Expected tag name. Got '>' instead.")})
    @token_queue.push({:type => :Characters, :data => "<>"})
    @state = :data_state
  elsif ch == "?"
    # XXX In theory it could be something besides a tag name. But
    # do we really care?
    message = _("Expected tag name. Got '?' instead (HTML doesn't " +
                "support processing instructions).")
    @token_queue.push({:type => :ParseError, :data => message})
    @stream.unget(ch)
    @state = :bogus_comment_state
  else
    # XXX
    @token_queue.push({:type => :ParseError, :data => _("Expected tag name. Got something else instead")})
    @token_queue.push({:type => :Characters, :data => "<"})
    @stream.unget(ch)
    @state = :data_state
  end
  true
end
# Close tag open state: entered after "</" has been consumed.
#
# In RCDATA/CDATA content the end tag only counts when it matches the name
# of the current token; otherwise "</" is queued as text and the tokenizer
# returns to the data state. Always returns true.
def close_tag_open_state
  if [:RCDATA, :CDATA].include?(@content_model_flag)
    matches = false
    if @current_token
      # So far we know that "</" has been consumed. We now need to know
      # whether the next few characters match the name of last emitted
      # start tag which also happens to be the current_token. We also need
      # to have the character directly after the characters that could
      # match the start tag name.
      expected = @current_token[:name]
      lookahead = []
      (expected.length + 1).times do
        lookahead << @stream.char
        # Make sure we don't get hit by :EOF
        break if lookahead.last == :EOF
      end
      # Since this is just for checking. We put the characters back on
      # the stack.
      @stream.unget(lookahead)

      terminators = SPACE_CHARACTERS + [">", "/", "<", :EOF]
      matches = expected.downcase == lookahead[0...-1].join('').downcase &&
                terminators.include?(lookahead.last)
    end

    unless matches
      @token_queue.push({:type => :Characters, :data => "</"})
      @state = :data_state
      # Need to return here since we don't want the rest of the
      # method to be walked through.
      return true
    end

    # Because the characters are correct we can safely switch to
    # PCDATA mode now. This also means we don't have to do it when
    # emitting the end tag token.
    @content_model_flag = :PCDATA
  end

  ch = @stream.char
  if ch == :EOF
    @token_queue.push({:type => :ParseError, :data => _("Expected closing tag. Unexpected end of file.")})
    @token_queue.push({:type => :Characters, :data => "</"})
    @state = :data_state
  elsif ASCII_LETTERS.include?(ch)
    @current_token = {:type => :EndTag, :name => ch, :data => []}
    @state = :tag_name_state
  elsif ch == ">"
    @token_queue.push({:type => :ParseError, :data => _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
    @state = :data_state
  else
    # XXX data can be _'_...
    @token_queue.push({:type => :ParseError, :data => _("Expected closing tag. Unexpected character '#{ch}' found.")})
    @stream.unget(ch)
    @state = :bogus_comment_state
  end
  true
end
# Tag name state: extends the name of the current tag token until the name
# is terminated by whitespace, ">", "/" or the end of the file.
# Always returns true.
def tag_name_state
  ch = @stream.char
  case ch
  when *SPACE_CHARACTERS
    @state = :before_attribute_name_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in the tag name.")})
    emit_current_token
  when ">"
    emit_current_token
  when "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    # A letter pulls in the whole run of letters that follows it; any other
    # character is added to the name on its own.
    addition = ch
    addition = ch + @stream.chars_until(ASCII_LETTERS, true) if ASCII_LETTERS.include?(ch)
    @current_token[:name] += addition
  end
  true
end
# Before attribute name state: skips whitespace inside a tag and starts a
# new attribute on the first character that is not whitespace, ">", "/" or
# the end of the file. Always returns true.
def before_attribute_name_state
  ch = @stream.char
  case ch
  when *SPACE_CHARACTERS
    # Swallow the rest of the whitespace run in one go.
    @stream.chars_until(SPACE_CHARACTERS, true)
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file. Expected attribute name instead.")})
    emit_current_token
  when ">"
    emit_current_token
  when "/"
    process_solidus_in_tag
  else
    # Letters and any other character alike open a new attribute whose
    # name starts with this character and whose value is still empty.
    @current_token[:data].push([ch, ""])
    @state = :attribute_name_state
  end
  true
end
# Attribute name state: extends the name of the attribute being built until
# it is terminated. On leaving the state the name is lowercased (when
# @lowercase_attr_name is set) and a parse error is queued if an earlier
# attribute on the same tag already has that name. Always returns true.
def attribute_name_state
  ch = @stream.char
  finished = true
  emit = false

  case ch
  when "="
    @state = :before_attribute_value_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in attribute name.")})
    @state = :data_state
    emit = true
  when ">"
    # XXX If we emit here the attributes are converted to a dict
    # without being checked and when the code below runs we error
    # because data is a dict not a list
    emit = true
  when *SPACE_CHARACTERS
    @state = :after_attribute_name_state
  when "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    # A letter pulls in the whole run of letters that follows it; any other
    # character is added to the name on its own.
    addition = ch
    addition = ch + @stream.chars_until(ASCII_LETTERS, true) if ASCII_LETTERS.include?(ch)
    @current_token[:data][-1][0] += addition
    finished = false
  end

  return true unless finished

  # Attributes are not dropped at this stage. That happens when the
  # start tag token is emitted so values can still be safely appended
  # to attributes, but we do want to report the parse error in time.
  attrs = @current_token[:data]
  attrs[-1][0] = attrs.last.first.downcase if @lowercase_attr_name
  current_name = attrs.last.first
  # Report a duplicate only once, however many earlier attributes match.
  if attrs[0...-1].any? { |pair| pair.first == current_name }
    @token_queue.push({:type => :ParseError, :data =>_("Dropped duplicate attribute on tag.")})
  end
  # XXX Fix for above XXX
  emit_current_token if emit
  true
end
# After attribute name state: reached when whitespace followed an attribute
# name. Waits for "=", the end of the tag, or the start of another
# attribute. Always returns true.
def after_attribute_name_state
  ch = @stream.char
  case ch
  when *SPACE_CHARACTERS
    # Swallow the rest of the whitespace run in one go.
    @stream.chars_until(SPACE_CHARACTERS, true)
  when "="
    @state = :before_attribute_value_state
  when ">"
    emit_current_token
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file. Expected = or end of tag.")})
    emit_current_token
  when "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    # Letters and any other character alike open a new attribute whose
    # name starts with this character and whose value is still empty.
    @current_token[:data].push([ch, ""])
    @state = :attribute_name_state
  end
  true
end
# Before attribute value state: reached after the "=" of an attribute.
# Picks the quoted or unquoted value state from the first character that is
# not whitespace. Always returns true.
def before_attribute_value_state
  ch = @stream.char
  case ch
  when *SPACE_CHARACTERS
    @stream.chars_until(SPACE_CHARACTERS, true)
  when "\""
    @state = :attribute_value_double_quoted_state
  when "&"
    # Put the "&" back so that the unquoted value state sees it.
    @state = :attribute_value_unquoted_state
    @stream.unget(ch)
  when "'"
    @state = :attribute_value_single_quoted_state
  when ">"
    emit_current_token
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file. Expected attribute value.")})
    emit_current_token
  else
    @current_token[:data][-1][1] += ch
    @state = :attribute_value_unquoted_state
  end
  true
end
# Attribute value (double-quoted) state: collects the value up to the
# closing double quote, handing "&" to process_entity_in_attribute.
# Always returns true.
def attribute_value_double_quoted_state
  ch = @stream.char
  case ch
  when "\""
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in attribute value (\").")})
    emit_current_token
  else
    chunk = ch + @stream.chars_until(["\"", "&"])
    @current_token[:data][-1][1] += chunk
  end
  true
end
# Attribute value (single-quoted) state: collects the value up to the
# closing single quote, handing "&" to process_entity_in_attribute.
# Always returns true.
def attribute_value_single_quoted_state
  ch = @stream.char
  case ch
  when "'"
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in attribute value (').")})
    emit_current_token
  else
    chunk = ch + @stream.chars_until(["'", "&"])
    @current_token[:data][-1][1] += chunk
  end
  true
end
# Attribute value (unquoted) state: collects the value until whitespace,
# ">" or the end of the file, handing "&" to process_entity_in_attribute.
# Always returns true.
def attribute_value_unquoted_state
  ch = @stream.char
  case ch
  when *SPACE_CHARACTERS
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute
  when ">"
    emit_current_token
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in attribute value.")})
    emit_current_token
  else
    stoppers = ["&", ">","<"] + SPACE_CHARACTERS
    @current_token[:data][-1][1] += ch + @stream.chars_until(stoppers)
  end
  true
end
# Bogus comment state: everything up to the next ">" is emitted as a
# comment token, after which the tokenizer returns to the data state.
# Always returns true.
def bogus_comment_state
  # Make a new comment token and give it as value all the characters
  # until the first > or :EOF (chars_until checks for :EOF automatically)
  # and emit it.
  text = @stream.chars_until((">"))
  @token_queue.push({:type => :Comment, :data => text})

  # Eat the character directly after the bogus comment which is either a
  # ">" or an :EOF.
  @stream.char
  @state = :data_state
  true
end
# Markup declaration open state: entered after "<!" has been consumed.
# Recognises the start of a comment ("--") or a case-insensitive "DOCTYPE";
# anything else is put back on the stream and handled as a bogus comment.
# Always returns true.
def markup_declaration_open_state
  lookahead = [@stream.char, @stream.char]
  if lookahead == ["-", "-"]
    @current_token = {:type => :Comment, :data => ""}
    @state = :comment_start_state
    return true
  end

  5.times { lookahead << @stream.char }
  # Put in explicit :EOF check
  is_doctype = !lookahead.include?(:EOF) && lookahead.join("").upcase == "DOCTYPE"
  if is_doctype
    @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
    @state = :doctype_state
  else
    @token_queue.push({:type => :ParseError, :data => _("Expected '--' or 'DOCTYPE'. Not found.")})
    @stream.unget(lookahead)
    @state = :bogus_comment_state
  end
  true
end
# Comment start state: entered after "<!--" has been consumed.
# A ">" or the end of the file right here emits the (empty) comment together
# with a parse error. Always returns true.
def comment_start_state
  ch = @stream.char
  case ch
  when "-"
    @state = :comment_start_dash_state
  when ">", :EOF
    message = (ch == ">") ? _("Incorrect comment.") : _("Unexpected end of file in comment.")
    @token_queue.push({:type => :ParseError, :data => message})
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:data] += ch + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end
# Comment start dash state: entered after "<!---" has been consumed.
# Any ordinary character means the dash belonged to the comment text, so it
# is added back in front of that character. Always returns true.
def comment_start_dash_state
  ch = @stream.char
  case ch
  when "-"
    @state = :comment_end_state
  when ">", :EOF
    message = (ch == ">") ? _("Incorrect comment.") : _("Unexpected end of file in comment.")
    @token_queue.push({:type => :ParseError, :data => message})
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:data] += '-' + ch + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end
# Comment state: accumulates comment text up to the next "-".
# Always returns true.
def comment_state
  ch = @stream.char
  case ch
  when "-"
    @state = :comment_end_dash_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in comment.")})
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:data] += ch + @stream.chars_until("-")
  end
  true
end
# Comment end dash state: a single "-" has been seen inside a comment.
# Always returns true.
def comment_end_dash_state
  ch = @stream.char
  case ch
  when "-"
    @state = :comment_end_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in comment (-)")})
    @token_queue.push(@current_token)
    @state = :data_state
  else
    # The dash was part of the text after all; add it back together with
    # everything up to the next dash.
    @current_token[:data] += "-" + ch + @stream.chars_until("-")
    # Consume the next character which is either a "-" or an :EOF as
    # well so if there's a "-" directly after the "-" we go nicely to
    # the "comment end state" without emitting a ParseError there.
    @stream.char
  end
  true
end
# Comment end state: "--" has been seen inside a comment. Only ">" ends the
# comment cleanly; everything else is a parse error. Always returns true.
def comment_end_state
  ch = @stream.char
  case ch
  when ">"
    @token_queue.push(@current_token)
    @state = :data_state
  when "-"
    @token_queue.push({:type => :ParseError, :data => _("Unexpected '-' after '--' found in comment.")})
    @current_token[:data] += ch
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in comment (--).")})
    @token_queue.push(@current_token)
    @state = :data_state
  else
    # XXX
    @token_queue.push({:type => :ParseError, :data => _("Unexpected character in comment found.")})
    @current_token[:data] += "--" + ch
    @state = :comment_state
  end
  true
end
# DOCTYPE state: entered after the literal "DOCTYPE" has been consumed.
# A missing space is reported and the character is put back; either way the
# tokenizer moves on to the before-DOCTYPE-name state. Always returns true.
def doctype_state
  ch = @stream.char
  unless SPACE_CHARACTERS.include?(ch)
    @token_queue.push({:type => :ParseError, :data => _("No space after literal string 'DOCTYPE'.")})
    @stream.unget(ch)
  end
  @state = :before_doctype_name_state
  true
end
# Before DOCTYPE name state: skips whitespace and starts the DOCTYPE name
# with the first other character. A ">" or the end of the file here emits
# the token marked as incorrect. Always returns true.
def before_doctype_name_state
  ch = @stream.char
  failure = nil
  case ch
  when *SPACE_CHARACTERS
    # Whitespace before the name is ignored.
  when ">"
    failure = _("Unexpected > character. Expected DOCTYPE name.")
  when :EOF
    failure = _("Unexpected end of file. Expected DOCTYPE name.")
  else
    @current_token[:name] = ch
    @state = :doctype_name_state
  end

  if failure
    @token_queue.push({:type => :ParseError, :data => failure})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  end
  true
end
# DOCTYPE name state: appends characters to the DOCTYPE name until
# whitespace, ">" or the end of the file. Always returns true.
def doctype_name_state
  ch = @stream.char
  case ch
  when *SPACE_CHARACTERS
    @state = :after_doctype_name_state
  when ">"
    @token_queue.push(@current_token)
    @state = :data_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE name.")})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:name] += ch
  end
  true
end
# After DOCTYPE name state: skips whitespace after the DOCTYPE name and then
# expects ">", or the case-insensitive keyword "public" or "system".
# Anything else is put back on the stream, reported, and handled by the
# bogus DOCTYPE state. Always returns true.
def after_doctype_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    # Whitespace after the name is ignored.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @current_token[:correct] = false
    @stream.unget(data)
    @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
    @token_queue << @current_token
    @state = :data_state
  else
    # Read five more characters so that, together with the one already
    # consumed, a six-letter keyword can be recognised. The instance
    # variable is read directly, like every other stream access in these
    # state methods, instead of going through a bare `stream` call.
    char_stack = [data]
    5.times { char_stack << @stream.char }
    token = char_stack.join('').tr(ASCII_UPPERCASE,ASCII_LOWERCASE)
    if token == "public" and !char_stack.include?(:EOF)
      @state = :before_doctype_public_identifier_state
    elsif token == "system" and !char_stack.include?(:EOF)
      @state = :before_doctype_system_identifier_state
    else
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => _("Expected 'public' or 'system'. Got '#{token}'")}
      @state = :bogus_doctype_state
    end
  end
  return true
end
# Before DOCTYPE public identifier state: reached after the "public"
# keyword. Skips whitespace and expects an opening quote. Always returns
# true.
def before_doctype_public_identifier_state
  ch = @stream.char
  failure = nil
  case ch
  when *SPACE_CHARACTERS
    # Whitespace before the identifier is ignored.
  when "\""
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_double_quoted_state
  when "'"
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_single_quoted_state
  when ">"
    failure = _("Unexpected end of DOCTYPE.")
  when :EOF
    failure = _("Unexpected end of file in DOCTYPE.")
  else
    @token_queue.push({:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")})
    @state = :bogus_doctype_state
  end

  # ">" and :EOF both end the DOCTYPE early: emit it marked as incorrect.
  if failure
    @token_queue.push({:type => :ParseError, :data => failure})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  end
  true
end
# DOCTYPE public identifier (double-quoted) state: collects the public
# identifier up to the closing double quote. Always returns true.
def doctype_public_identifier_double_quoted_state
  ch = @stream.char
  case ch
  when "\""
    @state = :after_doctype_public_identifier_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:publicId] += ch
  end
  true
end
# DOCTYPE public identifier (single-quoted) state: collects the public
# identifier up to the closing single quote. Always returns true.
def doctype_public_identifier_single_quoted_state
  ch = @stream.char
  case ch
  when "'"
    @state = :after_doctype_public_identifier_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:publicId] += ch
  end
  true
end
# After DOCTYPE public identifier state: skips whitespace, then expects
# either the opening quote of a system identifier or the closing ">".
# Always returns true.
def after_doctype_public_identifier_state
  ch = @stream.char
  case ch
  when *SPACE_CHARACTERS
    # Whitespace after the public identifier is ignored.
  when "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  when "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  when ">"
    @token_queue.push(@current_token)
    @state = :data_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @token_queue.push({:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")})
    @state = :bogus_doctype_state
  end
  true
end
# Before DOCTYPE system identifier state: reached after the "system"
# keyword. Skips whitespace and expects an opening quote. Always returns
# true.
def before_doctype_system_identifier_state
  ch = @stream.char
  failure = nil
  case ch
  when *SPACE_CHARACTERS
    # Whitespace before the identifier is ignored.
  when "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  when "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  when ">"
    failure = _("Unexpected character in DOCTYPE.")
  when :EOF
    failure = _("Unexpected end of file in DOCTYPE.")
  else
    @token_queue.push({:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")})
    @state = :bogus_doctype_state
  end

  # ">" and :EOF both end the DOCTYPE early: emit it marked as incorrect.
  if failure
    @token_queue.push({:type => :ParseError, :data => failure})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  end
  true
end
# DOCTYPE system identifier (double-quoted) state: collects the system
# identifier up to the closing double quote. Always returns true.
def doctype_system_identifier_double_quoted_state
  ch = @stream.char
  case ch
  when "\""
    @state = :after_doctype_system_identifier_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:systemId] += ch
  end
  true
end
# DOCTYPE system identifier (single-quoted) state: collects the system
# identifier up to the closing single quote. Always returns true.
def doctype_system_identifier_single_quoted_state
  ch = @stream.char
  case ch
  when "'"
    @state = :after_doctype_system_identifier_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:systemId] += ch
  end
  true
end
# After DOCTYPE system identifier state: skips whitespace and expects the
# closing ">". Always returns true.
def after_doctype_system_identifier_state
  ch = @stream.char
  case ch
  when *SPACE_CHARACTERS
    # Whitespace after the system identifier is ignored.
  when ">"
    @token_queue.push(@current_token)
    @state = :data_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @token_queue.push({:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")})
    @state = :bogus_doctype_state
  end
  true
end
# Bogus DOCTYPE state: the DOCTYPE is marked as incorrect and characters are
# discarded until ">" or the end of the file, at which point the token is
# emitted. Always returns true.
def bogus_doctype_state
  ch = @stream.char
  @current_token[:correct] = false
  case ch
  when ">"
    @token_queue.push(@current_token)
    @state = :data_state
  when :EOF
    # XXX EMIT
    @stream.unget(ch)
    @token_queue.push({:type => :ParseError, :data => _("Unexpected end of file in bogus doctype.")})
    @token_queue.push(@current_token)
    @state = :data_state
  end
  true
end
# Hook wrapped around every parse-error message; it returns the message
# unchanged.
def _(string)
  string
end
end
end

View file

@ -1,24 +1,24 @@
module HTML5lib
module HTML5
module TreeBuilders
class << self
def [](name)
case name.to_s.downcase
when 'simpletree' then
require 'html5lib/treebuilders/simpletree'
require 'html5/treebuilders/simpletree'
SimpleTree::TreeBuilder
when 'rexml' then
require 'html5lib/treebuilders/rexml'
require 'html5/treebuilders/rexml'
REXML::TreeBuilder
when 'hpricot' then
require 'html5lib/treebuilders/hpricot'
require 'html5/treebuilders/hpricot'
Hpricot::TreeBuilder
else
raise "Unknown TreeBuilder #{name}"
end
end
alias :getTreeBuilder :[]
alias :get_tree_builder :[]
end
end
end

View file

@ -1,8 +1,8 @@
require 'html5lib/constants'
require 'html5/constants'
#XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
module HTML5lib
module HTML5
# The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
@ -24,9 +24,9 @@ module HTML5lib
attr_accessor :_flags
def initialize(name)
@parent = nil
@parent = nil
@childNodes = []
@_flags = []
@_flags = []
end
# Insert node as a child of the current node
@ -76,13 +76,13 @@ module HTML5lib
# Base treebuilder implementation
class TreeBuilder
attr_accessor :openElements
attr_accessor :open_elements
attr_accessor :activeFormattingElements
attr_accessor :document
attr_accessor :headPointer
attr_accessor :head_pointer
attr_accessor :formPointer
@ -106,25 +106,25 @@ module HTML5lib
end
def reset
@openElements = []
@open_elements = []
@activeFormattingElements = []
#XXX - rename these to headElement, formElement
@headPointer = nil
@head_pointer = nil
@formPointer = nil
self.insertFromTable = false
self.insert_from_table = false
@document = @documentClass.new
end
def elementInScope(target, tableVariant=false)
# Exit early when possible.
return true if @openElements[-1].name == target
return true if @open_elements[-1].name == target
# AT How about while true and simply set node to [-1] and set it to
# [-2] at the end...
@openElements.reverse.each do |element|
@open_elements.reverse.each do |element|
if element.name == target
return true
elsif element.name == 'table'
@ -149,10 +149,10 @@ module HTML5lib
# Step 2 and step 3: we start with the last element. So i is -1.
i = -1
entry = @activeFormattingElements[i]
return if entry == Marker or @openElements.include?(entry)
return if entry == Marker or @open_elements.include?(entry)
# Step 6
until entry == Marker or @openElements.include?(entry)
until entry == Marker or @open_elements.include?(entry)
# Step 5: let entry be one earlier in the list.
i -= 1
begin
@ -171,7 +171,7 @@ module HTML5lib
clone = @activeFormattingElements[i].cloneNode
# Step 9
element = insertElement(clone.name, clone.attributes)
element = insert_element(clone.name, clone.attributes)
# Step 10
@activeFormattingElements[i] = element
@ -198,12 +198,15 @@ module HTML5lib
return false
end
def insertDoctype(name)
@document.appendChild(@doctypeClass.new(name))
def insertDoctype(name, public_id, system_id)
doctype = @doctypeClass.new(name)
doctype.public_id = public_id
doctype.system_id = system_id
@document.appendChild(doctype)
end
def insertComment(data, parent=nil)
parent = @openElements[-1] if parent.nil?
def insert_comment(data, parent=nil)
parent = @open_elements[-1] if parent.nil?
parent.appendChild(@commentClass.new(data))
end
@ -216,28 +219,28 @@ module HTML5lib
# Switch the function used to insert an element from the
# normal one to the misnested table one and back again
def insertFromTable=(value)
@insertFromTable = value
@insertElement = value ? :insertElementTable : :insertElementNormal
def insert_from_table=(value)
@insert_from_table = value
@insert_element = value ? :insert_elementTable : :insert_elementNormal
end
def insertElement(name, attributes)
send(@insertElement, name, attributes)
def insert_element(name, attributes)
send(@insert_element, name, attributes)
end
def insertElementNormal(name, attributes)
def insert_elementNormal(name, attributes)
element = @elementClass.new(name)
element.attributes = attributes
@openElements[-1].appendChild(element)
@openElements.push(element)
@open_elements.last.appendChild(element)
@open_elements.push(element)
return element
end
# Create an element and insert it into the tree
def insertElementTable(name, attributes)
def insert_elementTable(name, attributes)
element = @elementClass.new(name)
element.attributes = attributes
if TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)
if TABLE_INSERT_MODE_ELEMENTS.include?(@open_elements.last.name)
#We should be in the InTable mode. This means we want to do
#special magic element rearranging
parent, insertBefore = getTableMisnestedNodePosition
@ -246,17 +249,17 @@ module HTML5lib
else
parent.insertBefore(element, insertBefore)
end
@openElements.push(element)
@open_elements.push(element)
else
return insertElementNormal(name, attributes)
return insert_elementNormal(name, attributes)
end
return element
end
def insertText(data, parent=nil)
parent = @openElements[-1] if parent.nil?
parent = @open_elements[-1] if parent.nil?
if (not(@insertFromTable) or (@insertFromTable and not TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)))
if (not(@insert_from_table) or (@insert_from_table and not TABLE_INSERT_MODE_ELEMENTS.include?(@open_elements[-1].name)))
parent.insertText(data)
else
#We should be in the InTable mode. This means we want to do
@ -265,7 +268,7 @@ module HTML5lib
parent.insertText(data, insertBefore)
end
end
# Get the foster parent element, and sibling to insert before
# (or nil) when inserting a misnested table node
def getTableMisnestedNodePosition
@ -275,7 +278,7 @@ module HTML5lib
lastTable = nil
fosterParent = nil
insertBefore = nil
@openElements.reverse.each do |element|
@open_elements.reverse.each do |element|
if element.name == "table"
lastTable = element
break
@ -288,33 +291,34 @@ module HTML5lib
fosterParent = lastTable.parent
insertBefore = lastTable
else
fosterParent = @openElements[@openElements.index(lastTable) - 1]
fosterParent = @open_elements[@open_elements.index(lastTable) - 1]
end
else
fosterParent = @openElements[0]
fosterParent = @open_elements[0]
end
return fosterParent, insertBefore
end
def generateImpliedEndTags(exclude=nil)
name = @openElements[-1].name
name = @open_elements[-1].name
if (['dd', 'dt', 'li', 'p', 'td', 'th', 'tr'].include?(name) and name != exclude)
@openElements.pop
# XXX td, th and tr are not actually needed
if (%w[dd dt li p td th tr].include?(name) and name != exclude)
@open_elements.pop
# XXX This is not entirely what the specification says. We should
# investigate it more closely.
generateImpliedEndTags(exclude)
end
end
def getDocument
def get_document
@document
end
def getFragment
#assert @innerHTML
def get_fragment
#assert @inner_html
fragment = @fragmentClass.new
@openElements[0].reparentChildren(fragment)
@open_elements[0].reparentChildren(fragment)
return fragment
end

View file

@ -1,221 +1,231 @@
require 'html5lib/treebuilders/base'
require 'rubygems'
require 'hpricot'
require 'forwardable'
module HTML5lib
module TreeBuilders
module Hpricot
# Wraps an Hpricot object so that it can be driven through the tree
# builder's node interface. Each wrapper keeps its own childNodes list next
# to the children of the wrapped Hpricot object.
class Node < Base::Node
  extend Forwardable
  def_delegators :@hpricot, :name

  attr_accessor :hpricot

  # Builds the wrapped object from the subclass' hpricot_class.
  def initialize(name)
    super(name)
    @hpricot = self.class.hpricot_class.new name
  end

  # Adds node as the last child. A text node that directly follows another
  # text node is merged into it instead of being added as a separate child.
  def appendChild(node)
    if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
      # Merge through #content, the accessor TextNode#printTree reads, so
      # the merged text is built from the same value that is written back.
      # NOTE(review): #to_s was used here before; it is presumably not the
      # raw content for text containing entities - confirm against Hpricot.
      childNodes.last.hpricot.content = childNodes.last.hpricot.content + node.hpricot.content
    else
      childNodes << node
      hpricot.children << node.hpricot
    end
    # Detach the Hpricot object from its previous parent, if it had one.
    if (oldparent = node.hpricot.parent) != nil
      oldparent.children.delete_at(oldparent.children.index(node.hpricot))
    end
    node.hpricot.parent = hpricot
    node.parent = self
  end

  # Removes node from both the wrapper list and the Hpricot children.
  def removeChild(node)
    childNodes.delete(node)
    hpricot.children.delete_at(hpricot.children.index(node.hpricot))
    node.hpricot.parent = nil
    node.parent = nil
  end

  # Inserts data as a text node, before the given child or at the end.
  def insertText(data, before=nil)
    if before
      insertBefore(TextNode.new(data), before)
    else
      appendChild(TextNode.new(data))
    end
  end

  # Inserts node in front of refNode. A text node that would directly follow
  # another text node is merged into it instead.
  def insertBefore(node, refNode)
    index = childNodes.index(refNode)
    if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
      # Same merge rule as appendChild: combine the #content values.
      previous = childNodes[index-1].hpricot
      previous.content = previous.content + node.hpricot.content
    else
      refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
      childNodes.insert(index, node)
    end
  end

  # True when this node has at least one child.
  def hasContent
    childNodes.any?
  end
end
# Element node backed by an ::Hpricot::Elem.
class Element < Node
  def self.hpricot_class
    ::Hpricot::Elem
  end

  # Builds the wrapped element around a start tag with the given name.
  def initialize(name)
    super(name)
    @hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
  end

  # The element name, read from the start tag.
  def name
    @hpricot.stag.name
  end

  # Returns a new element of the same class with the same name and a copy
  # of every attribute. Children are not copied.
  def cloneNode
    copy = self.class.new(name)
    attributes.each { |attr_name, attr_value| copy.hpricot[attr_name] = attr_value }
    copy
  end

  # A call to Hpricot::Elem#raw_attributes is built dynamically,
  # so alterations to the returned value (a hash) will be lost.
  #
  # AttributeProxy works around this by forwarding :[]= calls
  # to the raw_attributes accessor on the element start tag.
  #
  class AttributeProxy
    def initialize(hpricot)
      @hpricot = hpricot
    end

    def []=(k, v)
      raw = @hpricot.stag.send(stag_attributes_method)
      raw[k] = v
    end

    def stag_attributes_method
      # STag#attributes changed to STag#raw_attributes after Hpricot 0.5
      @hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
    end

    # Everything else is answered by the element's own attributes object.
    def method_missing(*a, &b)
      @hpricot.attributes.send(*a, &b)
    end
  end

  def attributes
    AttributeProxy.new(@hpricot)
  end

  def attributes=(attrs)
    attrs.each { |attr_name, attr_value| @hpricot[attr_name] = attr_value }
  end

  # Renders the element, its attributes (except xmlns) and its children in
  # the indented tree format used by testSerializer.
  def printTree(indent=0)
    out = "\n|#{' ' * indent}<#{name}>"
    indent += 2
    attributes.each do |attr_name, attr_value|
      next if attr_name == 'xmlns'
      out += "\n|#{' ' * indent}#{attr_name}=\"#{attr_value}\""
    end
    childNodes.each { |child| out += child.printTree(indent) }
    out
  end
end
# Document node backed by an ::Hpricot::Doc.
class Document < Node
  def self.hpricot_class
    ::Hpricot::Doc
  end

  def initialize
    super(nil)
  end

  # Renders "#document" followed by every child, indented two columns deeper.
  def printTree(indent=0)
    out = '#document'
    childNodes.each { |child| out += child.printTree(indent + 2) }
    out
  end
end
# Document type node backed by an ::Hpricot::DocType.
class DocumentType < Node
def self.hpricot_class
::Hpricot::DocType
end
# Node#initialize builds the wrapped object by calling hpricot_class.new
# with a single argument. An ArgumentError raised by that call is swallowed
# on purpose (the original note says three arguments are needed) and the
# DocType is then built here explicitly with three arguments.
def initialize(name)
begin
super(name)
rescue ArgumentError # needs 3...
end
@hpricot = ::Hpricot::DocType.new(name, nil, nil)
end
# Renders the doctype as one line of the tree format used by testSerializer.
def printTree(indent=0)
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
end
end
# Fragment container: an Element with an empty name.
class DocumentFragment < Element
  def initialize
    super('')
  end

  # Renders only the children, indented two columns deeper; the fragment
  # itself produces no line.
  def printTree(indent=0)
    out = ''
    childNodes.each { |child| out += child.printTree(indent+2) }
    out
  end
end
# Text node backed by an ::Hpricot::Text.
class TextNode < Node
  # Sets the wrapped object directly; Node#initialize is not called.
  def initialize(data)
    @hpricot = ::Hpricot::Text.new(data)
  end

  # Renders the text, in double quotes, as one line of the tree format.
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|#{padding}\"#{hpricot.content}\""
  end
end
# Comment node backed by an ::Hpricot::Comment.
class CommentNode < Node
  def self.hpricot_class
    ::Hpricot::Comment
  end

  # Renders the comment as one line of the tree format.
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|#{padding}<!-- #{hpricot.content} -->"
  end
end
# Tree builder that produces Hpricot objects.
class TreeBuilder < Base::TreeBuilder
  # Registers the Hpricot-backed node classes defined in this module.
  def initialize
    @documentClass = Document
    @doctypeClass  = DocumentType
    @elementClass  = Element
    @commentClass  = CommentNode
    @fragmentClass = DocumentFragment
  end

  # Serializes node in the indented tree format.
  def testSerializer(node)
    node.printTree
  end

  # Returns the Hpricot object wrapped by the document node.
  def getDocument
    @document.hpricot
  end

  # Replaces @document with the fragment built by the base class and
  # returns the children of its Hpricot object.
  def getFragment
    @document = super
    @document.hpricot.children
  end
end
end
end
end
require 'html5/treebuilders/base'
require 'rubygems'
require 'hpricot'
require 'forwardable'
module HTML5
module TreeBuilders
module Hpricot
# Wraps an Hpricot object so that it can be driven through the tree
# builder's node interface. Each wrapper keeps its own childNodes list next
# to the children of the wrapped Hpricot object.
class Node < Base::Node
  extend Forwardable
  def_delegators :@hpricot, :name

  attr_accessor :hpricot

  # Builds the wrapped object from the subclass' hpricot_class.
  def initialize(name)
    super(name)
    @hpricot = self.class.hpricot_class.new name
  end

  # Adds node as the last child. A text node that directly follows another
  # text node is merged into it instead of being added as a separate child.
  def appendChild(node)
    if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
      childNodes.last.hpricot.content = childNodes.last.hpricot.content + node.hpricot.content
    else
      childNodes << node
      hpricot.children << node.hpricot
    end
    # Detach the Hpricot object from its previous parent, if it had one.
    if (oldparent = node.hpricot.parent) != nil
      oldparent.children.delete_at(oldparent.children.index(node.hpricot))
    end
    node.hpricot.parent = hpricot
    node.parent = self
  end

  # Removes node from both the wrapper list and the Hpricot children.
  def removeChild(node)
    childNodes.delete(node)
    hpricot.children.delete_at(hpricot.children.index(node.hpricot))
    node.hpricot.parent = nil
    node.parent = nil
  end

  # Inserts data as a text node, before the given child or at the end.
  def insertText(data, before=nil)
    if before
      insertBefore(TextNode.new(data), before)
    else
      appendChild(TextNode.new(data))
    end
  end

  # Inserts node in front of refNode. A text node that would directly follow
  # another text node is merged into it instead.
  def insertBefore(node, refNode)
    index = childNodes.index(refNode)
    if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
      # Merge through #content, exactly as appendChild does, so both merge
      # paths combine text the same way.
      # NOTE(review): #to_s was used here before; it is presumably not the
      # raw content for text containing entities - confirm against Hpricot.
      previous = childNodes[index-1].hpricot
      previous.content = previous.content + node.hpricot.content
    else
      refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
      childNodes.insert(index, node)
    end
  end

  # True when this node has at least one child.
  def hasContent
    childNodes.any?
  end
end
# Element node backed by an Hpricot::Elem.
class Element < Node
  # Hpricot class instantiated for this kind of node.
  def self.hpricot_class
    ::Hpricot::Elem
  end

  # Replaces the object created by the parent initializer with an Elem
  # built from a start tag named +name+.
  def initialize(name)
    super(name)
    start_tag = ::Hpricot::STag.new(name)
    @hpricot = ::Hpricot::Elem.new(start_tag)
  end

  # Tag name, read from the element's start tag.
  def name
    @hpricot.stag.name
  end

  # Returns a new node of the same class and name carrying a copy of this
  # element's attributes. Children are not copied.
  def cloneNode
    copy = self.class.new(name)
    attributes.each do |attr_name, attr_value|
      copy.hpricot[attr_name] = attr_value
    end
    copy
  end

  # A call to Hpricot::Elem#raw_attributes is built dynamically,
  # so alterations to the returned value (a hash) will be lost.
  #
  # AttributeProxy works around this by forwarding :[]= calls
  # to the raw_attributes accessor on the element start tag.
  #
  class AttributeProxy
    def initialize(hpricot)
      @hpricot = hpricot
    end

    # Writes straight into the start tag's attribute hash.
    def []=(key, value)
      stag_attributes = @hpricot.stag.send(stag_attributes_method)
      stag_attributes[key] = value
    end

    # Name of the start-tag accessor that exposes the attribute hash.
    def stag_attributes_method
      # STag#attributes changed to STag#raw_attributes after Hpricot 0.5
      if @hpricot.stag.respond_to?(:raw_attributes)
        :raw_attributes
      else
        :attributes
      end
    end

    # Everything else is answered by the element's own attributes object.
    def method_missing(*args, &block)
      @hpricot.attributes.send(*args, &block)
    end
  end

  # Returns a fresh proxy over the element's attributes.
  def attributes
    AttributeProxy.new(@hpricot)
  end

  # Sets every name/value pair of +attrs+ on the underlying element.
  def attributes=(attrs)
    attrs.each do |attr_name, attr_value|
      @hpricot[attr_name] = attr_value
    end
  end

  # Renders the element, its attributes (except 'xmlns') and its children,
  # one per line. Attributes and children are indented two columns deeper
  # than the element itself.
  def printTree(indent=0)
    inner_indent = indent + 2
    inner_padding = ' ' * inner_indent
    pieces = ["\n|#{' ' * indent}<#{name}>"]
    attributes.each do |attr_name, attr_value|
      pieces << "\n|#{inner_padding}#{attr_name}=\"#{attr_value}\"" unless attr_name == 'xmlns'
    end
    childNodes.each { |child| pieces << child.printTree(inner_indent) }
    pieces.join
  end
end
# Document root backed by an Hpricot::Doc.
class Document < Node
  # Hpricot class instantiated for this kind of node.
  def self.hpricot_class
    ::Hpricot::Doc
  end

  # A document has no name, so nil is handed to the parent initializer.
  def initialize
    super(nil)
  end

  # Returns "#document" followed by the printTree output of every child,
  # each rendered two columns deeper than +indent+.
  def printTree(indent=0)
    child_indent = indent + 2
    rendered_children = childNodes.map { |child| child.printTree(child_indent) }
    '#document' + rendered_children.join
  end
end
# Doctype node backed by an Hpricot::DocType.
class DocumentType < Node
  def_delegators :@hpricot, :public_id, :system_id

  # Hpricot class instantiated for this kind of node.
  def self.hpricot_class
    ::Hpricot::DocType
  end

  # Builds the backing DocType from +name+, +public_id+ and +system_id+.
  def initialize(name, public_id, system_id)
    # The parent initializer is called with the name only; an ArgumentError
    # raised on the way (the original note says the constructor "needs 3")
    # is deliberately swallowed, and the real object is built just below.
    begin
      super(name)
    rescue ArgumentError # needs 3...
    end
    @hpricot = ::Hpricot::DocType.new(name, public_id, system_id)
  end

  # Renders "<!DOCTYPE target>" on its own line, or "<!DOCTYPE >" when the
  # target is nil or empty.
  def printTree(indent=0)
    target = hpricot.target
    # String#any? only exists where String is Enumerable (Ruby 1.8), so the
    # emptiness check is spelled out explicitly.
    if target && !target.empty?
      "\n|#{' ' * indent}<!DOCTYPE #{target}>"
    else
      "\n|#{' ' * indent}<!DOCTYPE >"
    end
  end
end
# Fragment container: an Element created with an empty name.
class DocumentFragment < Element
  def initialize
    super('')
  end

  # Returns the concatenated printTree output of every child, each rendered
  # two columns deeper than +indent+. The fragment adds no line of its own.
  def printTree(indent=0)
    child_indent = indent + 2
    childNodes.map { |child| child.printTree(child_indent) }.join
  end
end
# Text node backed by an Hpricot::Text object.
class TextNode < Node
  # Builds the backing Hpricot::Text directly from +data+; the parent
  # initializer is not invoked.
  def initialize(data)
    @hpricot = ::Hpricot::Text.new(data)
  end

  # Renders the text content in double quotes on its own line, prefixed by
  # "|" and +indent+ spaces.
  def printTree(indent=0)
    padding = ' ' * indent
    %(\n|#{padding}"#{hpricot.content}")
  end
end
# Comment node backed by an Hpricot::Comment object.
class CommentNode < Node
  # Hpricot class instantiated for this kind of node.
  def self.hpricot_class
    ::Hpricot::Comment
  end

  # Renders the comment as "<!-- content -->" on its own line, prefixed by
  # "|" and +indent+ spaces.
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|" + padding + "<!-- #{hpricot.content} -->"
  end
end
# Tree builder that produces Hpricot-backed nodes.
class TreeBuilder < Base::TreeBuilder
  # Registers the node classes used for each kind of node.
  # Note: the parent initializer is not invoked here.
  def initialize
    @documentClass, @doctypeClass = Document, DocumentType
    @elementClass,  @commentClass = Element, CommentNode
    @fragmentClass = DocumentFragment
  end

  # Creates a doctype node from the three identifiers and appends it to the
  # current document.
  def insertDoctype(name, public_id, system_id)
    @document.appendChild(@doctypeClass.new(name, public_id, system_id))
  end

  # Serializes +node+ with its printTree method.
  def testSerializer(node)
    node.printTree
  end

  # Returns the Hpricot object behind the current document.
  def get_document
    @document.hpricot
  end

  # Stores the value returned by the parent implementation as the current
  # document and returns the children of its Hpricot object.
  def get_fragment
    (@document = super).hpricot.children
  end
end
end
end
end

View file

@ -1,8 +1,8 @@
require 'html5lib/treebuilders/base'
require 'html5/treebuilders/base'
require 'rexml/document'
require 'forwardable'
module HTML5lib
module HTML5
module TreeBuilders
module REXML
@ -17,11 +17,9 @@ module HTML5lib
end
def appendChild node
if node.kind_of? TextNode and
childNodes.length>0 and childNodes[-1].kind_of? TextNode
childNodes[-1].rxobj.value =
childNodes[-1].rxobj.to_s + node.rxobj.to_s
childNodes[-1].rxobj.raw = true
if node.kind_of?(TextNode) && childNodes.length > 0 && childNodes.last.kind_of?(TextNode)
childNodes.last.rxobj.value = childNodes.last.rxobj.to_s + node.rxobj.to_s
childNodes.last.rxobj.raw = true
else
childNodes.push node
rxobj.add node.rxobj
@ -45,10 +43,8 @@ module HTML5lib
def insertBefore node, refNode
index = childNodes.index(refNode)
if node.kind_of? TextNode and index>0 and
childNodes[index-1].kind_of? TextNode
childNodes[index-1].rxobj.value =
childNodes[index-1].rxobj.to_s + node.rxobj.to_s
if node.kind_of?(TextNode) and index > 0 && childNodes[index-1].kind_of?(TextNode)
childNodes[index-1].rxobj.value = childNodes[index-1].rxobj.to_s + node.rxobj.to_s
childNodes[index-1].rxobj.raw = true
else
childNodes.insert index, node
@ -57,7 +53,7 @@ module HTML5lib
end
def hasContent
return (childNodes.length > 0)
(childNodes.length > 0)
end
end
@ -77,7 +73,7 @@ module HTML5lib
end
def attributes= value
value.each {|name, value| rxobj.attributes[name]=value}
value.each {|name, value| rxobj.attributes[name] = value}
end
def printTree indent=0
@ -90,7 +86,7 @@ module HTML5lib
for child in childNodes
tree += child.printTree(indent)
end
return tree
tree
end
end
@ -120,10 +116,25 @@ module HTML5lib
end
class DocumentType < Node
def_delegator :@rxobj, :public, :public_id
def_delegator :@rxobj, :system, :system_id
def self.rxclass
::REXML::DocType
end
def initialize name, public_id, system_id
super(name)
if public_id
@rxobj = ::REXML::DocType.new [name, ::REXML::DocType::PUBLIC, public_id, system_id]
elsif system_id
@rxobj = ::REXML::DocType.new [name, ::REXML::DocType::SYSTEM, nil, system_id]
else
@rxobj = ::REXML::DocType.new name
end
end
def printTree indent=0
"\n|#{' ' * indent}<!DOCTYPE #{name}>"
end
@ -145,7 +156,7 @@ module HTML5lib
class TextNode < Node
def initialize data
raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;')
raw = data.gsub('&', '&amp;').gsub('<', '&lt;').gsub('>', '&gt;')
@rxobj = ::REXML::Text.new(raw, true, nil, true)
end
@ -167,21 +178,26 @@ module HTML5lib
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@fragmentClass = DocumentFragment
end
def testSerializer node
node.printTree()
def insertDoctype(name, public_id, system_id)
doctype = @doctypeClass.new(name, public_id, system_id)
@document.appendChild(doctype)
end
def getDocument
def testSerializer node
node.printTree
end
def get_document
@document.rxobj
end
def getFragment
def get_fragment
@document = super
return @document.rxobj.children
end

View file

@ -1,6 +1,6 @@
require 'html5lib/treebuilders/base'
require 'html5/treebuilders/base'
module HTML5lib
module HTML5
module TreeBuilders
module SimpleTree
@ -18,17 +18,17 @@ module HTML5lib
def initialize name
super
@name = name
@value = nil
@name = name
@value = nil
@attributes = {}
end
def appendChild node
if node.kind_of? TextNode and
childNodes.length>0 and childNodes[-1].kind_of? TextNode
childNodes[-1].value += node.value
childNodes.length > 0 and childNodes.last.kind_of? TextNode
childNodes.last.value += node.value
else
childNodes.push node
childNodes << node
end
node.parent = self
end
@ -55,8 +55,7 @@ module HTML5lib
def insertBefore node, refNode
index = childNodes.index(refNode)
if node.kind_of? TextNode and index>0 and
childNodes[index-1].kind_of? TextNode
if node.kind_of?(TextNode) && index > 0 && childNodes[index-1].kind_of?(TextNode)
childNodes[index-1].value += node.value
else
childNodes.insert index, node
@ -72,7 +71,7 @@ module HTML5lib
end
def hasContent
return (childNodes.length > 0)
childNodes.length > 0
end
end
@ -90,7 +89,7 @@ module HTML5lib
for child in childNodes
tree += child.printTree(indent)
end
return tree
tree
end
end
@ -108,13 +107,21 @@ module HTML5lib
for child in childNodes
tree += child.printTree(indent + 2)
end
return tree
tree
end
end
class DocumentType < Node
attr_accessor :public_id, :system_id
def to_s
"<!DOCTYPE %s>" % name
"<!DOCTYPE #{name}>"
end
def initialize name
super name
@public_id = nil
@system_id = nil
end
end
@ -157,19 +164,19 @@ module HTML5lib
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@fragmentClass = DocumentFragment
end
def testSerializer node
node.printTree()
node.printTree
end
def getFragment
def get_fragment
@document = super
return @document.childNodes
@document.childNodes
end
end

View file

@ -0,0 +1,26 @@
require 'html5/treewalkers/base'
module HTML5
  module TreeWalkers
    class << self
      # Looks up a tree walker class by name (case-insensitive; anything
      # responding to to_s is accepted). The implementation file is only
      # required when its walker is first requested.
      #
      # Raises a RuntimeError for an unrecognised name.
      def [](name)
        key = name.to_s.downcase
        if key == 'simpletree'
          require 'html5/treewalkers/simpletree'
          SimpleTree::TreeWalker
        elsif key == 'rexml'
          require 'html5/treewalkers/rexml'
          REXML::TreeWalker
        elsif key == 'hpricot'
          require 'html5/treewalkers/hpricot'
          Hpricot::TreeWalker
        else
          raise "Unknown TreeWalker #{name}"
        end
      end

      alias_method :get_tree_walker, :[]
    end
  end
end

View file

@ -0,0 +1,154 @@
require 'html5/constants'
module HTML5
  module TreeWalkers

    # Factory methods for the token hashes emitted by tree walkers.
    module TokenConstructor
      # Builds a "SerializeError" token carrying +msg+.
      def error(msg)
        {:type => "SerializeError", :data => msg}
      end

      # Converts +attrs+ to an array with to_a.
      def normalize_attrs(attrs)
        attrs.to_a
      end

      # Builds an :EmptyTag token.
      # NOTE(review): when +has_children+ is true an error token is built but
      # its value is discarded; only the :EmptyTag token is returned.
      def empty_tag(name, attrs, has_children=false)
        error(_("Void element has children")) if has_children
        {:type => :EmptyTag, :name => name, :data => normalize_attrs(attrs)}
      end

      # Builds a :StartTag token.
      def start_tag(name, attrs)
        {:type => :StartTag, :name => name, :data => normalize_attrs(attrs)}
      end

      # Builds an :EndTag token (its :data is always an empty array).
      def end_tag(name)
        {:type => :EndTag, :name => name, :data => []}
      end

      # Splits +data+ into up to three tokens and yields them in order:
      # leading whitespace (:SpaceCharacters), the middle (:Characters) and
      # trailing whitespace (:SpaceCharacters). Whitespace is whatever
      # SPACE_CHARACTERS lists. Data that is only whitespace yields a single
      # :SpaceCharacters token.
      def text(data)
        space = SPACE_CHARACTERS.join('')
        if data =~ /\A([#{space}]+)/m
          yield({:type => :SpaceCharacters, :data => $1})
          data = data[$1.length .. -1]
          return if data.empty?
        end

        if data =~ /([#{space}]+)\Z/m
          yield({:type => :Characters, :data => data[0 ... -$1.length]})
          yield({:type => :SpaceCharacters, :data => $1})
        else
          yield({:type => :Characters, :data => data})
        end
      end

      # Builds a :Comment token.
      def comment(data)
        {:type => :Comment, :data => data}
      end

      # Builds a :Doctype token.
      def doctype(name, public_id, system_id, correct=nil)
        {:type => :Doctype, :name => name, :public_id => public_id, :system_id => system_id, :correct => correct}
      end

      # Builds an error token for a node type the walker does not know.
      def unknown(nodeType)
        error(_("Unknown node type: ") + nodeType.to_s)
      end

      # Message hook; returns +str+ unchanged.
      def _(str)
        str
      end
    end

    # Common superclass of all tree walkers; subclasses implement #each.
    class Base
      include TokenConstructor

      def initialize(tree)
        @tree = tree
      end

      # Yields the tokens of the tree. Must be overridden.
      def each
        raise NotImplementedError
      end

      # Same as #each. This is a real method rather than `alias walk each`:
      # an alias is bound to the stub above, so subclasses that override only
      # #each would still raise NotImplementedError from #walk.
      def walk(&block)
        each(&block)
      end
    end

    # Walker that traverses the tree iteratively, using four navigation
    # primitives that subclasses must supply.
    class NonRecursiveTreeWalker < TreeWalkers::Base
      # Returns an array whose first item is the node kind (:DOCTYPE, :TEXT,
      # :ELEMENT, :COMMENT, :DOCUMENT, :DOCUMENT_FRAGMENT or nil to ignore
      # the node), followed by the kind-specific details consumed by #each.
      def node_details(node)
        raise NotImplementedError
      end

      # Returns the first child of +node+, or nil.
      def first_child(node)
        raise NotImplementedError
      end

      # Returns the sibling following +node+, or nil.
      def next_sibling(node)
        raise NotImplementedError
      end

      # Returns the parent of +node+.
      def parent(node)
        raise NotImplementedError
      end

      # Yields one token per node event, in document order, starting at the
      # tree given to the constructor.
      def each
        current_node = @tree
        while current_node != nil
          details = node_details(current_node)
          has_children = false

          case details.shift
          when :DOCTYPE
            yield doctype(*details)
          when :TEXT
            text(*details) {|token| yield token}
          when :ELEMENT
            name, attributes, has_children = details
            if VOID_ELEMENTS.include?(name)
              yield empty_tag(name, attributes.to_a, has_children)
              # Never descend into a void element.
              has_children = false
            else
              yield start_tag(name, attributes.to_a)
            end
          when :COMMENT
            yield comment(details[0])
          when :DOCUMENT, :DOCUMENT_FRAGMENT
            has_children = true
          when nil
            # ignore (REXML::XMLDecl is an example)
          else
            yield unknown(details[0])
          end

          child = has_children ? first_child(current_node) : nil
          if child != nil
            current_node = child
          else
            # No children to visit: close elements while climbing until a
            # following sibling is found or the root has been reached.
            while current_node != nil
              details = node_details(current_node)
              if details.shift == :ELEMENT
                name, attributes, has_children = details
                yield end_tag(name) if !VOID_ELEMENTS.include?(name)
              end

              if @tree == current_node
                current_node = nil
              else
                sibling = next_sibling(current_node)
                if sibling != nil
                  current_node = sibling
                  break
                end
                current_node = parent(current_node)
              end
            end
          end
        end
      end
    end
  end
end

View file

@ -1,10 +1,10 @@
require 'html5lib/treewalkers/base'
require 'html5/treewalkers/base'
require 'rexml/document'
module HTML5lib
module HTML5
module TreeWalkers
module Hpricot
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
def node_details(node)
case node
@ -13,17 +13,17 @@ module HTML5lib
[:DOCUMENT_FRAGMENT]
else
[:ELEMENT, node.name,
node.attributes.map {|name,value| [name,value]},
node.attributes.map {|name, value| [name, value]},
!node.empty?]
end
when ::Hpricot::Text
[:TEXT, node.to_plain_text]
[:TEXT, node.content]
when ::Hpricot::Comment
[:COMMENT, node.content]
when ::Hpricot::Doc
[:DOCUMENT]
when ::Hpricot::DocType
[:DOCTYPE, node.target]
[:DOCTYPE, node.target, node.public_id, node.system_id]
when ::Hpricot::XMLDecl
[nil]
else

View file

@ -1,10 +1,10 @@
require 'html5lib/treewalkers/base'
require 'html5/treewalkers/base'
require 'rexml/document'
module HTML5lib
module HTML5
module TreeWalkers
module REXML
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
def node_details(node)
case node
@ -23,7 +23,7 @@ module HTML5lib
when ::REXML::Comment
[:COMMENT, node.string]
when ::REXML::DocType
[:DOCTYPE, node.name]
[:DOCTYPE, node.name, node.public, node.system]
when ::REXML::XMLDecl
[nil]
else

View file

@ -1,10 +1,10 @@
require 'html5lib/treewalkers/base'
require 'html5/treewalkers/base'
module HTML5lib
module HTML5
module TreeWalkers
module SimpleTree
class TreeWalker < HTML5lib::TreeWalkers::Base
include HTML5lib::TreeBuilders::SimpleTree
class TreeWalker < HTML5::TreeWalkers::Base
include HTML5::TreeBuilders::SimpleTree
def walk(node)
case node
@ -12,20 +12,20 @@ module HTML5lib
return
when DocumentType
yield doctype(node.name)
yield doctype(node.name, node.public_id, node.system_id)
when TextNode
text(node.value) {|token| yield token}
when Element
if VOID_ELEMENTS.include?(node.name)
yield emptyTag(node.name, node.attributes, node.hasContent())
yield empty_tag(node.name, node.attributes, node.hasContent())
else
yield startTag(node.name, node.attributes)
yield start_tag(node.name, node.attributes)
for child in node.childNodes
walk(child) {|token| yield token}
end
yield endTag(node.name)
yield end_tag(node.name)
end
when CommentNode

View file

@ -0,0 +1,3 @@
module HTML5
  # Version string of the library; frozen so it cannot be mutated in place.
  VERSION = '0.1.0'.freeze
end

View file

@ -1,11 +0,0 @@
require 'html5lib/html5parser'
module HTML5lib
  # Parses +stream+ as a complete document by delegating to HTMLParser.parse.
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  # Parses +stream+ as a fragment. This previously delegated to
  # HTMLParser.parse, which made it identical to HTML5lib.parse.
  # NOTE(review): relies on HTMLParser.parseFragment being defined by
  # html5lib/html5parser -- confirm.
  def self.parseFragment(stream, options={})
    HTMLParser.parseFragment(stream, options)
  end
end

View file

@ -1,708 +0,0 @@
module HTML5lib
class EOF < Exception; end
CONTENT_MODEL_FLAGS = [
:PCDATA,
:RCDATA,
:CDATA,
:PLAINTEXT
]
SCOPING_ELEMENTS = %w[
button
caption
html
marquee
object
table
td
th
]
FORMATTING_ELEMENTS = %w[
a
b
big
em
font
i
nobr
s
small
strike
strong
tt
u
]
SPECIAL_ELEMENTS = %w[
address
area
base
basefont
bgsound
blockquote
body
br
center
col
colgroup
dd
dir
div
dl
dt
embed
fieldset
form
frame
frameset
h1
h2
h3
h4
h5
h6
head
hr
iframe
image
img
input
isindex
li
link
listing
menu
meta
noembed
noframes
noscript
ol
optgroup
option
p
param
plaintext
pre
script
select
spacer
style
tbody
textarea
tfoot
thead
title
tr
ul
wbr
]
SPACE_CHARACTERS = %W[
\t
\n
\x0B
\x0C
\x20
\r
]
TABLE_INSERT_MODE_ELEMENTS = %w[
table
tbody
tfoot
thead
tr
]
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
DIGITS = '0'..'9'
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
# Heading elements need to be ordered
HEADING_ELEMENTS = %w[
h1
h2
h3
h4
h5
h6
]
# XXX What about event-source and command?
VOID_ELEMENTS = %w[
base
link
meta
hr
br
img
embed
param
area
col
input
]
CDATA_ELEMENTS = %w[title textarea]
RCDATA_ELEMENTS = %w[
style
script
xmp
iframe
noembed
noframes
noscript
]
BOOLEAN_ATTRIBUTES = {
:global => %w[irrelevant],
'style' => %w[scoped],
'img' => %w[ismap],
'audio' => %w[autoplay controls],
'video' => %w[autoplay controls],
'script' => %w[defer async],
'details' => %w[open],
'datagrid' => %w[multiple disabled],
'command' => %w[hidden disabled checked default],
'menu' => %w[autosubmit],
'fieldset' => %w[disabled readonly],
'option' => %w[disabled readonly selected],
'optgroup' => %w[disabled readonly],
'button' => %w[disabled autofocus],
'input' => %w[disabled readonly required autofocus checked ismap],
'select' => %w[disabled readonly autofocus multiple],
'output' => %w[disabled readonly]
}
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN
65533, # 0x81 UNDEFINED
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
8224, # 0x86 0x2020 DAGGER
8225, # 0x87 0x2021 DOUBLE DAGGER
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
8240, # 0x89 0x2030 PER MILLE SIGN
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
65533, # 0x8D UNDEFINED
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
65533, # 0x8F UNDEFINED
65533, # 0x90 UNDEFINED
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
8226, # 0x95 0x2022 BULLET
8211, # 0x96 0x2013 EN DASH
8212, # 0x97 0x2014 EM DASH
732, # 0x98 0x02DC SMALL TILDE
8482, # 0x99 0x2122 TRADE MARK SIGN
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
65533, # 0x9D UNDEFINED
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
]
private
# Packs the integer +codepoint+ with the 'U' (UTF-8 character) directive and
# returns the resulting string.
def self.U(codepoint)
  [codepoint].pack('U')
end
public
ENTITIES = {
"AElig" => U(0xC6),
"Aacute" => U(0xC1),
"Acirc" => U(0xC2),
"Agrave" => U(0xC0),
"Alpha" => U(0x0391),
"Aring" => U(0xC5),
"Atilde" => U(0xC3),
"Auml" => U(0xC4),
"Beta" => U(0x0392),
"Ccedil" => U(0xC7),
"Chi" => U(0x03A7),
"Dagger" => U(0x2021),
"Delta" => U(0x0394),
"ETH" => U(0xD0),
"Eacute" => U(0xC9),
"Ecirc" => U(0xCA),
"Egrave" => U(0xC8),
"Epsilon" => U(0x0395),
"Eta" => U(0x0397),
"Euml" => U(0xCB),
"Gamma" => U(0x0393),
"Iacute" => U(0xCD),
"Icirc" => U(0xCE),
"Igrave" => U(0xCC),
"Iota" => U(0x0399),
"Iuml" => U(0xCF),
"Kappa" => U(0x039A),
"Lambda" => U(0x039B),
"Mu" => U(0x039C),
"Ntilde" => U(0xD1),
"Nu" => U(0x039D),
"OElig" => U(0x0152),
"Oacute" => U(0xD3),
"Ocirc" => U(0xD4),
"Ograve" => U(0xD2),
"Omega" => U(0x03A9),
"Omicron" => U(0x039F),
"Oslash" => U(0xD8),
"Otilde" => U(0xD5),
"Ouml" => U(0xD6),
"Phi" => U(0x03A6),
"Pi" => U(0x03A0),
"Prime" => U(0x2033),
"Psi" => U(0x03A8),
"Rho" => U(0x03A1),
"Scaron" => U(0x0160),
"Sigma" => U(0x03A3),
"THORN" => U(0xDE),
"Tau" => U(0x03A4),
"Theta" => U(0x0398),
"Uacute" => U(0xDA),
"Ucirc" => U(0xDB),
"Ugrave" => U(0xD9),
"Upsilon" => U(0x03A5),
"Uuml" => U(0xDC),
"Xi" => U(0x039E),
"Yacute" => U(0xDD),
"Yuml" => U(0x0178),
"Zeta" => U(0x0396),
"aacute" => U(0xE1),
"acirc" => U(0xE2),
"acute" => U(0xB4),
"aelig" => U(0xE6),
"agrave" => U(0xE0),
"alefsym" => U(0x2135),
"alpha" => U(0x03B1),
"amp" => U(0x26),
"AMP" => U(0x26),
"and" => U(0x2227),
"ang" => U(0x2220),
"apos" => U(0x27),
"aring" => U(0xE5),
"asymp" => U(0x2248),
"atilde" => U(0xE3),
"auml" => U(0xE4),
"bdquo" => U(0x201E),
"beta" => U(0x03B2),
"brvbar" => U(0xA6),
"bull" => U(0x2022),
"cap" => U(0x2229),
"ccedil" => U(0xE7),
"cedil" => U(0xB8),
"cent" => U(0xA2),
"chi" => U(0x03C7),
"circ" => U(0x02C6),
"clubs" => U(0x2663),
"cong" => U(0x2245),
"copy" => U(0xA9),
"COPY" => U(0xA9),
"crarr" => U(0x21B5),
"cup" => U(0x222A),
"curren" => U(0xA4),
"dArr" => U(0x21D3),
"dagger" => U(0x2020),
"darr" => U(0x2193),
"deg" => U(0xB0),
"delta" => U(0x03B4),
"diams" => U(0x2666),
"divide" => U(0xF7),
"eacute" => U(0xE9),
"ecirc" => U(0xEA),
"egrave" => U(0xE8),
"empty" => U(0x2205),
"emsp" => U(0x2003),
"ensp" => U(0x2002),
"epsilon" => U(0x03B5),
"equiv" => U(0x2261),
"eta" => U(0x03B7),
"eth" => U(0xF0),
"euml" => U(0xEB),
"euro" => U(0x20AC),
"exist" => U(0x2203),
"fnof" => U(0x0192),
"forall" => U(0x2200),
"frac12" => U(0xBD),
"frac14" => U(0xBC),
"frac34" => U(0xBE),
"frasl" => U(0x2044),
"gamma" => U(0x03B3),
"ge" => U(0x2265),
"gt" => U(0x3E),
"GT" => U(0x3E),
"hArr" => U(0x21D4),
"harr" => U(0x2194),
"hearts" => U(0x2665),
"hellip" => U(0x2026),
"iacute" => U(0xED),
"icirc" => U(0xEE),
"iexcl" => U(0xA1),
"igrave" => U(0xEC),
"image" => U(0x2111),
"infin" => U(0x221E),
"int" => U(0x222B),
"iota" => U(0x03B9),
"iquest" => U(0xBF),
"isin" => U(0x2208),
"iuml" => U(0xEF),
"kappa" => U(0x03BA),
"lArr" => U(0x21D0),
"lambda" => U(0x03BB),
"lang" => U(0x2329),
"laquo" => U(0xAB),
"larr" => U(0x2190),
"lceil" => U(0x2308),
"ldquo" => U(0x201C),
"le" => U(0x2264),
"lfloor" => U(0x230A),
"lowast" => U(0x2217),
"loz" => U(0x25CA),
"lrm" => U(0x200E),
"lsaquo" => U(0x2039),
"lsquo" => U(0x2018),
"lt" => U(0x3C),
"LT" => U(0x3C),
"macr" => U(0xAF),
"mdash" => U(0x2014),
"micro" => U(0xB5),
"middot" => U(0xB7),
"minus" => U(0x2212),
"mu" => U(0x03BC),
"nabla" => U(0x2207),
"nbsp" => U(0xA0),
"ndash" => U(0x2013),
"ne" => U(0x2260),
"ni" => U(0x220B),
"not" => U(0xAC),
"notin" => U(0x2209),
"nsub" => U(0x2284),
"ntilde" => U(0xF1),
"nu" => U(0x03BD),
"oacute" => U(0xF3),
"ocirc" => U(0xF4),
"oelig" => U(0x0153),
"ograve" => U(0xF2),
"oline" => U(0x203E),
"omega" => U(0x03C9),
"omicron" => U(0x03BF),
"oplus" => U(0x2295),
"or" => U(0x2228),
"ordf" => U(0xAA),
"ordm" => U(0xBA),
"oslash" => U(0xF8),
"otilde" => U(0xF5),
"otimes" => U(0x2297),
"ouml" => U(0xF6),
"para" => U(0xB6),
"part" => U(0x2202),
"permil" => U(0x2030),
"perp" => U(0x22A5),
"phi" => U(0x03C6),
"pi" => U(0x03C0),
"piv" => U(0x03D6),
"plusmn" => U(0xB1),
"pound" => U(0xA3),
"prime" => U(0x2032),
"prod" => U(0x220F),
"prop" => U(0x221D),
"psi" => U(0x03C8),
"quot" => U(0x22),
"QUOT" => U(0x22),
"rArr" => U(0x21D2),
"radic" => U(0x221A),
"rang" => U(0x232A),
"raquo" => U(0xBB),
"rarr" => U(0x2192),
"rceil" => U(0x2309),
"rdquo" => U(0x201D),
"real" => U(0x211C),
"reg" => U(0xAE),
"REG" => U(0xAE),
"rfloor" => U(0x230B),
"rho" => U(0x03C1),
"rlm" => U(0x200F),
"rsaquo" => U(0x203A),
"rsquo" => U(0x2019),
"sbquo" => U(0x201A),
"scaron" => U(0x0161),
"sdot" => U(0x22C5),
"sect" => U(0xA7),
"shy" => U(0xAD),
"sigma" => U(0x03C3),
"sigmaf" => U(0x03C2),
"sim" => U(0x223C),
"spades" => U(0x2660),
"sub" => U(0x2282),
"sube" => U(0x2286),
"sum" => U(0x2211),
"sup" => U(0x2283),
"sup1" => U(0xB9),
"sup2" => U(0xB2),
"sup3" => U(0xB3),
"supe" => U(0x2287),
"szlig" => U(0xDF),
"tau" => U(0x03C4),
"there4" => U(0x2234),
"theta" => U(0x03B8),
"thetasym" => U(0x03D1),
"thinsp" => U(0x2009),
"thorn" => U(0xFE),
"tilde" => U(0x02DC),
"times" => U(0xD7),
"trade" => U(0x2122),
"uArr" => U(0x21D1),
"uacute" => U(0xFA),
"uarr" => U(0x2191),
"ucirc" => U(0xFB),
"ugrave" => U(0xF9),
"uml" => U(0xA8),
"upsih" => U(0x03D2),
"upsilon" => U(0x03C5),
"uuml" => U(0xFC),
"weierp" => U(0x2118),
"xi" => U(0x03BE),
"yacute" => U(0xFD),
"yen" => U(0xA5),
"yuml" => U(0xFF),
"zeta" => U(0x03B6),
"zwj" => U(0x200D),
"zwnj" => U(0x200C)
}
ENCODINGS = %w[
ansi_x3.4-1968
iso-ir-6
ansi_x3.4-1986
iso_646.irv:1991
ascii
iso646-us
us-ascii
us
ibm367
cp367
csascii
ks_c_5601-1987
korean
iso-2022-kr
csiso2022kr
euc-kr
iso-2022-jp
csiso2022jp
iso-2022-jp-2
iso-ir-58
chinese
csiso58gb231280
iso_8859-1:1987
iso-ir-100
iso_8859-1
iso-8859-1
latin1
l1
ibm819
cp819
csisolatin1
iso_8859-2:1987
iso-ir-101
iso_8859-2
iso-8859-2
latin2
l2
csisolatin2
iso_8859-3:1988
iso-ir-109
iso_8859-3
iso-8859-3
latin3
l3
csisolatin3
iso_8859-4:1988
iso-ir-110
iso_8859-4
iso-8859-4
latin4
l4
csisolatin4
iso_8859-6:1987
iso-ir-127
iso_8859-6
iso-8859-6
ecma-114
asmo-708
arabic
csisolatinarabic
iso_8859-7:1987
iso-ir-126
iso_8859-7
iso-8859-7
elot_928
ecma-118
greek
greek8
csisolatingreek
iso_8859-8:1988
iso-ir-138
iso_8859-8
iso-8859-8
hebrew
csisolatinhebrew
iso_8859-5:1988
iso-ir-144
iso_8859-5
iso-8859-5
cyrillic
csisolatincyrillic
iso_8859-9:1989
iso-ir-148
iso_8859-9
iso-8859-9
latin5
l5
csisolatin5
iso-8859-10
iso-ir-157
l6
iso_8859-10:1992
csisolatin6
latin6
hp-roman8
roman8
r8
ibm037
cp037
csibm037
ibm424
cp424
csibm424
ibm437
cp437
437
cspc8codepage437
ibm500
cp500
csibm500
ibm775
cp775
cspc775baltic
ibm850
cp850
850
cspc850multilingual
ibm852
cp852
852
cspcp852
ibm855
cp855
855
csibm855
ibm857
cp857
857
csibm857
ibm860
cp860
860
csibm860
ibm861
cp861
861
cp-is
csibm861
ibm862
cp862
862
cspc862latinhebrew
ibm863
cp863
863
csibm863
ibm864
cp864
csibm864
ibm865
cp865
865
csibm865
ibm866
cp866
866
csibm866
ibm869
cp869
869
cp-gr
csibm869
ibm1026
cp1026
csibm1026
koi8-r
cskoi8r
koi8-u
big5-hkscs
ptcp154
csptcp154
pt154
cp154
utf-7
utf-16be
utf-16le
utf-16
utf-8
iso-8859-13
iso-8859-14
iso-ir-199
iso_8859-14:1998
iso_8859-14
latin8
iso-celtic
l8
iso-8859-15
iso_8859-15
iso-8859-16
iso-ir-226
iso_8859-16:2001
iso_8859-16
latin10
l10
gbk
cp936
ms936
gb18030
shift_jis
ms_kanji
csshiftjis
euc-jp
gb2312
big5
csbig5
windows-1250
windows-1251
windows-1252
windows-1253
windows-1254
windows-1255
windows-1256
windows-1257
windows-1258
tis-620
hz-gb-2312
]
end

View file

@ -1 +0,0 @@
require 'html5lib/filters/optionaltags'

View file

@ -1,57 +0,0 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase handling tokens seen while inside a frameset.
  class InFramesetPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset

    handle_start 'html', 'frameset', 'frame', 'noframes'

    handle_end 'frameset', 'noframes'

    # Character data is reported as a parse error and dropped.
    def processCharacters(data)
      message = _('Unexpected characters in the frameset phase. Characters ignored.')
      @parser.parseError(message)
    end

    def startTagFrameset(name, attributes)
      @tree.insertElement(name, attributes)
    end

    # Inserts the frame element and pops it straight off the stack of open
    # elements.
    def startTagFrame(name, attributes)
      @tree.insertElement(name, attributes)
      @tree.openElements.pop
    end

    # Handled by the in-body phase.
    def startTagNoframes(name, attributes)
      body_phase = @parser.phases[:inBody]
      body_phase.processStartTag(name, attributes)
    end

    def startTagOther(name, attributes)
      @parser.parseError(_("Unexpected start tag token (#{name}) in the frameset phase. Ignored"))
    end

    def endTagFrameset(name)
      if @tree.openElements.last.name == 'html'
        # innerHTML case
        @parser.parseError(_("Unexpected end tag token (frameset) in the frameset phase (innerHTML)."))
      else
        @tree.openElements.pop
      end
      # Outside innerHTML mode, switch phase once the current node is no
      # longer a "frameset" element.
      return if @parser.innerHTML
      return if @tree.openElements.last.name == 'frameset'
      @parser.phase = @parser.phases[:afterFrameset]
    end

    # Handled by the in-body phase.
    def endTagNoframes(name)
      body_phase = @parser.phases[:inBody]
      body_phase.processEndTag(name)
    end

    def endTagOther(name)
      @parser.parseError(_("Unexpected end tag token (#{name}) in the frameset phase. Ignored."))
    end
  end
end

View file

@ -1,126 +0,0 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase handling tokens seen while inside the head element.
  class InHeadPhase < Phase

    handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta )

    handle_end 'head'
    handle_end %w( html body br ) => 'ImplyAfterHead'
    handle_end %w( title style script )

    # Reports and pops an unterminated title/style/script element, then
    # leaves the head and hands EOF to whichever phase is current.
    def processEOF
      name = @tree.openElements.last.name
      if %w[title style script].include?(name)
        @parser.parseError(_("Unexpected end of file. Expected end tag (#{name})."))
        @tree.openElements.pop
      end
      anythingElse
      @parser.phase.processEOF
    end

    # Text inside title/style/script is inserted; any other text ends the
    # head and is reprocessed by the new current phase.
    def processCharacters(data)
      unless %w[title style script].include?(@tree.openElements.last.name)
        anythingElse
        return @parser.phase.processCharacters(data)
      end
      @tree.insertText(data)
    end

    def startTagHead(name, attributes)
      @parser.parseError(_('Unexpected start tag head in existing head. Ignored'))
    end

    def startTagTitle(name, attributes)
      element = @tree.createElement(name, attributes)
      appendToHead(element)
      @tree.openElements.push(element)
      @parser.tokenizer.contentModelFlag = :RCDATA
    end

    def startTagStyle(name, attributes)
      element = @tree.createElement(name, attributes)
      appendToHeadOrCurrentNode(element)
      @tree.openElements.push(element)
      @parser.tokenizer.contentModelFlag = :CDATA
    end

    def startTagScript(name, attributes)
      #XXX Inner HTML case may be wrong
      element = @tree.createElement(name, attributes)
      element._flags.push("parser-inserted")
      appendToHeadOrCurrentNode(element)
      @tree.openElements.push(element)
      @parser.tokenizer.contentModelFlag = :CDATA
    end

    def startTagBaseLinkMeta(name, attributes)
      element = @tree.createElement(name, attributes)
      appendToHeadOrCurrentNode(element)
    end

    # Any other start tag ends the head and is reprocessed.
    def startTagOther(name, attributes)
      anythingElse
      @parser.phase.processStartTag(name, attributes)
    end

    # Pops the head element when it is the current node (otherwise reports a
    # parse error) and always moves on to the after-head phase.
    def endTagHead(name)
      if @tree.openElements.last.name == 'head'
        @tree.openElements.pop
      else
        @parser.parseError(_("Unexpected end tag (head). Ignored."))
      end
      @parser.phase = @parser.phases[:afterHead]
    end

    def endTagImplyAfterHead(name)
      anythingElse
      @parser.phase.processEndTag(name)
    end

    def endTagTitleStyleScript(name)
      if @tree.openElements.last.name == name
        @tree.openElements.pop
      else
        @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
      end
    end

    def endTagOther(name)
      @parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
    end

    # Leaves the head: closes it when it is the current node, otherwise just
    # switches to the after-head phase.
    def anythingElse
      if @tree.openElements.last.name == 'head'
        endTagHead('head')
      else
        @parser.phase = @parser.phases[:afterHead]
      end
    end

    protected

    # Appends +element+ to the head element; when there is no head pointer
    # (asserted to be the innerHTML case) it goes under the current node.
    def appendToHead(element)
      head = @tree.headPointer
      return head.appendChild(element) unless head.nil?
      assert @parser.innerHTML
      @tree.openElements.last.appendChild(element)
    end

    # Appends +element+ to the head when a head pointer exists and the parser
    # is currently in this phase; otherwise appends it to the current node.
    def appendToHeadOrCurrentNode(element)
      if !@tree.headPointer.nil? && @parser.phase == @parser.phases[:inHead]
        appendToHead(element)
      else
        @tree.openElements.last.appendChild(element)
      end
    end
  end
end

View file

@ -1,84 +0,0 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase handling tokens seen while inside a select element.
  class InSelectPhase < Phase

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select

    handle_start 'html', 'option', 'optgroup', 'select'

    handle_end 'option', 'optgroup', 'select', %w( caption table tbody tfoot thead tr td th ) => 'TableElements'

    def processCharacters(data)
      @tree.insertText(data)
    end

    def startTagOption(name, attributes)
      # We need to imply </option> if <option> is the current node.
      @tree.openElements.pop if @tree.openElements[-1].name == 'option'
      @tree.insertElement(name, attributes)
    end

    # An open option and then an open optgroup are closed implicitly before
    # the new optgroup is inserted.
    def startTagOptgroup(name, attributes)
      @tree.openElements.pop if @tree.openElements[-1].name == 'option'
      @tree.openElements.pop if @tree.openElements[-1].name == 'optgroup'
      @tree.insertElement(name, attributes)
    end

    # A nested <select> start tag is reported and treated as </select>.
    def startTagSelect(name, attributes)
      @parser.parseError(_('Unexpected start tag (select) in the select phase implies select start tag.'))
      endTagSelect('select')
    end

    def startTagOther(name, attributes)
      # Double quotes are required here so that #{name} is interpolated;
      # the single-quoted form emitted the placeholder text literally.
      @parser.parseError(_("Unexpected start tag token (#{name}) in the select phase. Ignored."))
    end

    def endTagOption(name)
      if @tree.openElements[-1].name == 'option'
        @tree.openElements.pop
      else
        @parser.parseError(_('Unexpected end tag (option) in the select phase. Ignored.'))
      end
    end

    def endTagOptgroup(name)
      # </optgroup> implicitly closes <option>
      if @tree.openElements[-1].name == 'option' and @tree.openElements[-2].name == 'optgroup'
        @tree.openElements.pop
      end
      # It also closes </optgroup>
      if @tree.openElements[-1].name == 'optgroup'
        @tree.openElements.pop
      # But nothing else
      else
        @parser.parseError(_('Unexpected end tag (optgroup) in the select phase. Ignored.'))
      end
    end

    def endTagSelect(name)
      if in_scope?('select', true)
        remove_open_elements_until('select')
        @parser.resetInsertionMode
      else
        # innerHTML case
        # NOTE(review): parseError is called without a message here, unlike
        # every other call in this class -- confirm that is intended.
        @parser.parseError
      end
    end

    # A table-related end tag is always reported; when the named element is
    # in scope the select is closed and the tag is reprocessed.
    def endTagTableElements(name)
      @parser.parseError(_("Unexpected table end tag (#{name}) in the select phase."))
      if in_scope?(name, true)
        endTagSelect('select')
        @parser.phase.processEndTag(name)
      end
    end

    def endTagOther(name)
      @parser.parseError(_("Unexpected end tag token (#{name}) in the select phase. Ignored."))
    end
  end
end

View file

@ -1,36 +0,0 @@
require 'html5lib/html5parser/phase'
module HTML5lib
  # Phase entered when only end of file is expected. Anything other than
  # comments and whitespace switches back to the parser's lastPhase and is
  # reprocessed there.
  class TrailingEndPhase < Phase

    # Nothing to do at end of file.
    def processEOF
    end

    # Comments are attached to the document itself.
    def processComment(data)
      @tree.insertComment(data, @tree.document)
    end

    # Whitespace is handed to the last phase without switching phase.
    def processSpaceCharacters(data)
      @parser.lastPhase.processSpaceCharacters(data)
    end

    def processCharacters(data)
      @parser.parseError(_('Unexpected non-space characters. Expected end of file.'))
      @parser.phase = @parser.lastPhase
      @parser.phase.processCharacters(data)
    end

    def processStartTag(name, attributes)
      # Double quotes so that #{name} is interpolated into the message.
      @parser.parseError(_("Unexpected start tag (#{name}). Expected end of file."))
      @parser.phase = @parser.lastPhase
      @parser.phase.processStartTag(name, attributes)
    end

    def processEndTag(name)
      # Double quotes so that #{name} is interpolated into the message.
      @parser.parseError(_("Unexpected end tag (#{name}). Expected end of file."))
      @parser.phase = @parser.lastPhase
      @parser.phase.processEndTag(name)
    end
  end
end

View file

@ -1,2 +0,0 @@
require 'html5lib/serializer/htmlserializer'
require 'html5lib/serializer/xhtmlserializer'

View file

@ -1,19 +0,0 @@
require 'html5lib/serializer/htmlserializer'
module HTML5lib
  # HTMLSerializer variant that starts from the option set in DEFAULTS.
  class XHTMLSerializer < HTMLSerializer
    # Options applied unless the caller overrides them.
    DEFAULTS = {
      :quote_attr_values           => true,
      :minimize_boolean_attributes => false,
      :use_trailing_solidus        => true,
      :escape_lt_in_attrs          => true,
      :omit_optional_tags          => false
    }

    # options:: Hash of serializer options; its entries take precedence
    #           over DEFAULTS. DEFAULTS itself is never modified.
    def initialize(options={})
      super(DEFAULTS.merge(options))
    end
  end
end

File diff suppressed because it is too large Load diff

View file

@ -1,26 +0,0 @@
require 'html5lib/treewalkers/base'
module HTML5lib
  module TreeWalkers
    class << self
      # Looks up a tree walker class by name (case-insensitive; anything
      # responding to to_s is accepted). The implementing file is required
      # only when that walker is requested.
      #
      # Raises RuntimeError for a name that is not recognised.
      def [](name)
        key = name.to_s.downcase
        if key == 'simpletree'
          require 'html5lib/treewalkers/simpletree'
          SimpleTree::TreeWalker
        elsif key == 'rexml'
          require 'html5lib/treewalkers/rexml'
          REXML::TreeWalker
        elsif key == 'hpricot'
          require 'html5lib/treewalkers/hpricot'
          Hpricot::TreeWalker
        else
          raise "Unknown TreeWalker #{name}"
        end
      end

      # Method-style name for the same lookup.
      alias :getTreeWalker :[]
    end
  end
end

View file

@ -1,156 +0,0 @@
require 'html5lib/constants'
module HTML5lib
module TreeWalkers
module TokenConstructor
  # Helpers that build the token hashes emitted by tree walkers. Every token
  # is a Hash with a :type entry plus :name and/or :data as appropriate.

  # Token describing a serialization problem. Unlike the other tokens its
  # :type is a String, not a Symbol.
  def error(msg)
    {:type => "SerializeError", :data => msg}
  end

  # Attributes are carried in tokens as an array (attrs.to_a).
  def normalizeAttrs(attrs)
    attrs.to_a
  end

  # Token for a void element.
  # NOTE(review): when hasChildren is set an error token is built but its
  # value is not used; the caller only ever receives the :EmptyTag token.
  def emptyTag(name, attrs, hasChildren=false)
    error(_("Void element has children")) if hasChildren
    {:type => :EmptyTag, :name => name, :data => normalizeAttrs(attrs)}
  end

  def startTag(name, attrs)
    {:type => :StartTag, :name => name, :data => normalizeAttrs(attrs)}
  end

  def endTag(name)
    {:type => :EndTag, :name => name, :data => []}
  end

  # Yields the tokens for a run of text: leading whitespace (if any) as a
  # :SpaceCharacters token, the remainder as :Characters, and trailing
  # whitespace (if any) as a second :SpaceCharacters token. Text that is
  # whitespace only produces a single :SpaceCharacters token.
  def text(data)
    space_run = "[#{SPACE_CHARACTERS.join('')}]+"
    leading = data[/\A(#{space_run})/m, 1]
    if leading
      yield({:type => :SpaceCharacters, :data => leading})
      data = data[leading.length .. -1]
      return if data.empty?
    end
    trailing = data[/(#{space_run})\Z/m, 1]
    if trailing
      yield({:type => :Characters, :data => data[0 ... -trailing.length]})
      yield({:type => :SpaceCharacters, :data => trailing})
    else
      yield({:type => :Characters, :data => data})
    end
  end

  def comment(data)
    {:type => :Comment, :data => data}
  end

  # :data is true exactly when the doctype name is "html" in any letter case.
  def doctype(name)
    {:type => :Doctype, :name => name, :data => name.upcase() == "HTML"}
  end

  # Error token for a node type the walker does not know how to handle.
  def unknown(nodeType)
    error(_("Unknown node type: ") + nodeType.to_s)
  end

  # Message hook; returns the string unchanged.
  def _(str)
    str
  end
end
# Common superclass of the tree walkers: stores the tree to walk and mixes in
# the token-building helpers. Subclasses implement #each.
class Base
  include TokenConstructor

  # tree:: the tree (or root node) this walker will traverse.
  def initialize(tree)
    @tree = tree
  end

  # Yields the tokens for @tree. Must be overridden by subclasses.
  def each
    raise NotImplementedError
  end

  # Same as #each. This is a real method that delegates rather than an
  # `alias`, because an alias captures the method body that exists at the
  # time it is declared: `walk` would stay bound to the abstract #each above
  # and raise NotImplementedError even on subclasses that override #each.
  def walk(&block)
    each(&block)
  end
end
# Tree walker that traverses @tree iteratively, using only the four
# navigation hooks below. Subclasses supply the hooks.
class NonRecursiveTreeWalker < TreeWalkers::Base
  # Hook: returns an array whose first element is the node kind (:DOCTYPE,
  # :TEXT, :ELEMENT, :COMMENT, :DOCUMENT, :DOCUMENT_FRAGMENT or nil) and
  # whose remaining elements are the arguments for that kind; for :ELEMENT
  # these are name, attributes and a has-children flag.
  def node_details(node)
    raise NotImplementedError
  end

  # Hook: first child of node, or nil.
  def first_child(node)
    raise NotImplementedError
  end

  # Hook: next sibling of node, or nil.
  def next_sibling(node)
    raise NotImplementedError
  end

  # Hook: parent of node.
  def parent(node)
    raise NotImplementedError
  end

  # Yields one token per visited node, in document order, starting at @tree.
  # End tags are emitted while climbing back up from a finished subtree.
  def each
    node = @tree
    while node != nil
      info = node_details(node)
      descend = false
      case info.shift
      when :DOCTYPE
        yield doctype(*info)
      when :TEXT
        text(*info) { |token| yield token }
      when :ELEMENT
        tag, attrs, descend = info
        if VOID_ELEMENTS.include?(tag)
          yield emptyTag(tag, attrs.to_a, descend)
          # Void elements are never descended into.
          descend = false
        else
          yield startTag(tag, attrs.to_a)
        end
      when :COMMENT
        yield comment(info[0])
      when :DOCUMENT, :DOCUMENT_FRAGMENT
        descend = true
      when nil
        # ignore (REXML::XMLDecl is an example)
      else
        yield unknown(info[0])
      end

      child = nil
      child = first_child(node) if descend
      if child != nil
        node = child
        next
      end

      # No children to visit: close this node, then move to its next
      # sibling, or keep climbing and closing ancestors until one has a
      # sibling or the root is reached.
      while node != nil
        info = node_details(node)
        if info.shift == :ELEMENT
          tag = info[0]
          yield endTag(tag) unless VOID_ELEMENTS.include?(tag)
        end
        if @tree == node
          # Back at the root: the walk is complete.
          node = nil
        else
          sibling = next_sibling(node)
          if sibling != nil
            node = sibling
            break
          end
          node = parent(node)
        end
      end
    end
  end
end
end
end

View file

@ -26,15 +26,15 @@ def parse(opts, args)
exit(1)
end
require 'html5lib/treebuilders'
treebuilder = HTML5lib::TreeBuilders[opts.treebuilder]
require 'html5/treebuilders'
treebuilder = HTML5::TreeBuilders[opts.treebuilder]
if opts.output == :xml
require 'html5lib/liberalxmlparser'
p = HTML5lib::XHTMLParser.new(:tree=>treebuilder)
require 'html5/liberalxmlparser'
p = HTML5::XHTMLParser.new(:tree=>treebuilder)
else
require 'html5lib/html5parser'
p = HTML5lib::HTMLParser.new(:tree=>treebuilder)
require 'html5/html5parser'
p = HTML5::HTMLParser.new(:tree=>treebuilder)
end
if opts.parsemethod == :parse
@ -70,10 +70,10 @@ def printOutput(parser, document, opts)
when :xml
print document
when :html
require 'html5lib/treewalkers'
tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document)
require 'html5lib/serializer'
puts HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer)
require 'html5/treewalkers'
tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
require 'html5/serializer'
puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
when :hilite
print document.hilite
when :tree
@ -188,6 +188,10 @@ opts = OptionParser.new do |opts|
options.serializer[:escape_lt_in_attrs] = lt
end
opts.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata|
options.serializer[:escape_rcdata] = rcdata
end
opts.separator ""
opts.separator "Other Options:"

View file

@ -33,7 +33,6 @@ EUC-jp
#encoding
EUC-jp
#data
<!-- -->
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">

View file

@ -92,7 +92,8 @@
{"description": "rcdata",
"input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]],
"expected": ["<script>a<b>c&d"]
"expected": ["<script>a<b>c&d"],
"xhtml": ["<script>a&lt;b&gt;c&amp;d"]
},
{"description": "doctype",

View file

@ -49,6 +49,12 @@
"options": {"escape_lt_in_attrs": true},
"input": [["StartTag", "a", {"title": "a<b>c&d"}]],
"expected": ["<a title=\"a&lt;b>c&amp;d\">"]
},
{"description": "rcdata",
"options": {"escape_rcdata": true},
"input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]],
"expected": ["<script>a&lt;b&gt;c&amp;d"]
}
]}

View file

@ -3,13 +3,13 @@
{"description": "bare text with leading spaces",
"options": {"strip_whitespace": true},
"input": [["Characters", "\t\r\n\u000B\u000C foo"]],
"expected": ["foo"]
"expected": [" foo"]
},
{"description": "bare text with trailing spaces",
"options": {"strip_whitespace": true},
"input": [["Characters", "foo \t\r\n\u000B\u000C"]],
"expected": ["foo"]
"expected": ["foo "]
},
{"description": "bare text with inner spaces",

View file

@ -0,0 +1,43 @@
[
{"type": "text/html", "input": ""},
{"type": "text/html", "input": "<!---->"},
{"type": "text/html", "input": "<!--asdfaslkjdf;laksjdf as;dkfjsd-->"},
{"type": "text/html", "input": "<!"},
{"type": "text/html", "input": "\t"},
{"type": "text/html", "input": "<!>"},
{"type": "text/html", "input": "<?"},
{"type": "text/html", "input": "<??>"},
{"type": "application/rss+xml", "input": "<rss"},
{"type": "application/atom+xml", "input": "<feed"},
{"type": "text/html", "input": "<html"},
{"type": "text/html", "input": "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n<html><head>\n<title>302 Found</title>\n</head><body>\n<h1>Found</h1>\n<p>The document has moved <a href=\"http://feeds.feedburner.com/gofug\">here</a>.</p>\n</body></html>\n"},
{"type": "text/html", "input": "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">\r\n<HTML><HEAD>\r\n <link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/289619328/feed.css\" /><link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/431602649/feed.css\" />\r\n<link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/382549546/feed.css\" />\r\n<link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/314618017/feed.css\" /><META http-equiv=\"expires\" content="},
{"type": "text/html", "input": "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\r\n<html>\r\n<head>\r\n<title>Xiaxue - Chicken pie blogger.</title><meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\"><style type=\"text/css\">\r\n<style type=\"text/css\">\r\n<!--\r\nbody {\r\n background-color: #FFF2F2;\r\n}\r\n.style1 {font-family: Georgia, \"Times New Roman\", Times, serif}\r\n.style2 {\r\n color: #8a567c;\r\n font-size: 14px;\r\n font-family: Georgia, \"Times New Roman\", Times, serif;\r\n}\r"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<head> \r\n<title>Google Operating System</title>\r\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"Description\" content=\"Unofficial news and tips about Google. A blog that watches Google's latest developments and the attempts to move your operating system online.\" />\r\n<meta name=\"generator\" c"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<head>\r\n <title>Assimilated Press</title> <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"MSSmartTagsPreventParsing\" content=\"true\" />\r\n<meta name=\"generator\" content=\"Blogger\" />\r\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"Assimilated Press - Atom\" href=\"http://assimila"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<head>\r\n <title>PostSecret</title>\r\n<META name=\"keywords\" Content=\"secrets, postcard, secret, postcards, postsecret, postsecrets,online confessional, post secret, post secrets, artomatic, post a secret\"><META name=\"discription\" Content=\"See a Secret...Share a Secret\"> <meta http-equiv=\"Content-Type\" content=\"te"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns='http://www.w3.org/1999/xhtml' xmlns:b='http://www.google.com/2005/gml/b' xmlns:data='http://www.google.com/2005/gml/data' xmlns:expr='http://www.google.com/2005/gml/expr'>\n <head>\n \n <meta content='text/html; charset=UTF-8' http-equiv='Content-Type'/>\n <meta content='true' name='MSSmartTagsPreventParsing'/>\n <meta content='blogger' name='generator'/>\n <link rel=\"alternate\" typ"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"ja\">\n<head profile=\"http://gmpg.org/xfn/11\"> \n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" /> \n<title> CMS Lever</title><link rel=\"stylesheet\" type=\"text/css\" media=\"screen\" href=\"http://s.wordpress.com/wp-content/themes/pub/twenty-eight/2813.css\"/>\n<link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS 2.0\" h"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"en\"><head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\n<title> Park Avenue Peerage</title>\t<meta name=\"generator\" content=\"WordPress.com\" />\t<!-- feeds -->\n\t<link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS 2.0\" href=\"http://parkavenuepeerage.wordpress.com/feed/\" />\t<link rel=\"pingback\" href="},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"ja\"><head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\n<title> \u884c\u96f2\u6d41\u6c34 -like a floating clouds and running water-</title>\t<meta name=\"generator\" content=\"WordPress.com\" />\t<!-- feeds -->\n\t<link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS 2.0\" href=\"http://shw4.wordpress.com/feed/\" />\t<li"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n<title>Go Fug Yourself</title><link rel=\"stylesheet\" href=\"http://gofugyourself.typepad.com/go_fug_yourself/styles.css\" type=\"text/css\" />\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"Atom\" "},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"en\"><head profile=\"http://gmpg.org/xfn/11\">\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" /><title> Ladies&#8230;</title><meta name=\"generator\" content=\"WordPress.com\" /> <!-- leave this for stats --><link rel=\"stylesheet\" href=\"http://s.wordpress.com/wp-content/themes/default/style.css?1\" type=\"tex"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\r\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\r\n<head>\r\n <title>The Sartorialist</title> <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"MSSmartTagsPreventParsing\" content=\"true\" />\r\n<meta name=\"generator\" content=\"Blogger\" />\r\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"The Sartorialist - Atom\" href=\"http://thesartorialist.blogspot"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\">\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\" />\n<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n<title>Creating Passionate Users</title><link rel=\"stylesheet\" href=\"http://headrush.typepad.com/creating_passionate_users/styles.css\" type=\"text/css\" />\n<link rel=\"alternate\" type"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" id=\"sixapart-standard\">\n<head>\n\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n\t<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n\t\n\t\n <meta name=\"keywords\" content=\"marketing, blog, seth, ideas, respect, permission\" />\n <meta name=\"description\" content=\"Seth Godin's riffs on marketing, respect, and the "},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" id=\"sixapart-standard\">\n<head>\n\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n\t<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n\t\n\t\n \n <meta name=\"description\" content=\" Western Civilization hangs in the balance. This blog is part of the solution,the cure. Get your heads out of the sand and Fight the G"},
{"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"en\">\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=pahrefhttpwwwfeedburnercomtarget_blankimgsrchttpwwwfeedburnercomfbimagespubpowered_by_fbgifaltPoweredbyFeedBurnerstyleborder0ap\" />\n<title> From Under the Rotunda</title>\n<link rel=\"stylesheet\" href=\"http://s.wordpress.com/wp-content/themes/pub/andreas04/style.css\" type=\"text/css\""},
{"type": "application/atom+xml", "input": "<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href=\"http://www.blogger.com/styles/atom.css\" type=\"text/css\"?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/'><id>tag:blogger.com,1999:blog-10861780</id><updated>2007-07-27T12:38:50.888-07:00</updated><title type='text'>Official Google Blog</title><link rel='alternate' type='text/html' href='http://googleblog.blogspot.com/'/><link rel='next' type='application/atom+xml' href='http://googleblog.blogs"},
{"type": "application/rss+xml", "input": "<?xml version='1.0' encoding='UTF-8'?><rss xmlns:atom='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' version='2.0'><channel><atom:id>tag:blogger.com,1999:blog-10861780</atom:id><lastBuildDate>Fri, 27 Jul 2007 19:38:50 +0000</lastBuildDate><title>Official Google Blog</title><description/><link>http://googleblog.blogspot.com/</link><managingEditor>Eric Case</managingEditor><generator>Blogger</generator><openSearch:totalResults>729</openSearch:totalResults><openSearc"},
{"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"pahrefhttpwwwfeedburnercomtarget_blankimgsrchttpwwwfeedburnercomfbimagespubpowered_by_fbgifaltPoweredbyFeedBurnerstyleborder0ap\"?>\n<!-- generator=\"wordpress/MU\" -->\n<rss version=\"2.0\"\n\txmlns:content=\"http://purl.org/rss/1.0/modules/content/\"\n\txmlns:wfw=\"http://wellformedweb.org/CommentAPI/\"\n\txmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n\t><channel>\n\t<title>From Under the Rotunda</title>\n\t<link>http://dannybernardi.wordpress.com</link>\n\t<description>The Monographs of Danny Ber"},
{"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!-- generator=\"wordpress/MU\" -->\n<rss version=\"2.0\"\n\txmlns:content=\"http://purl.org/rss/1.0/modules/content/\"\n\txmlns:wfw=\"http://wellformedweb.org/CommentAPI/\"\n\txmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n\t><channel>\n\t<title>CMS Lever</title>\n\t<link>http://kanaguri.wordpress.com</link>\n\t<description>CMS\u306e\u6c17\u306b\u306a\u3063\u305f\u3053\u3068</description>\n\t<pubDate>Wed, 18 Jul 2007 21:26:22 +0000</pubDate>\n\t<generator>http://wordpress.org/?v=MU</generator>\n\t<language>ja</languag"},
{"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:thr=\"http://purl.org/syndication/thread/1.0\">\n <title>Atlas Shrugs</title>\n <link rel=\"self\" type=\"application/atom+xml\" href=\"http://atlasshrugs2000.typepad.com/atlas_shrugs/atom.xml\" />\n <link rel=\"alternate\" type=\"text/html\" href=\"http://atlasshrugs2000.typepad.com/atlas_shrugs/\" />\n <id>tag:typepad.com,2003:weblog-132946</id>\n <updated>2007-08-15T16:07:34-04"},
{"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atom10full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:thr=\"http://purl.org/syndication/thread/1.0\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\">\r\n <title>Creating Passionate Users</title>\r\n "},
{"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atom10full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\">\r\n <title>Seth's Blog</title>\r\n <link rel=\"alternate\" type=\"text/html\" href=\"http://sethgodin.typepad.com/seths_blog/\" />\r\n <link rel=\"s"},
{"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atom10full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:openSearch=\"http://a9.com/-/spec/opensearchrss/1.0/\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\"><id>tag:blogger.com,1999:blog-32454861</id><updated>2007-07-31T21:44:09.867+02:00</upd"},
{"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atomfull.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://purl.org/atom/ns#\" xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\" version=\"0.3\">\r\n <title>Go Fug Yourself</title>\r\n <link rel=\"alternate\" type=\"text/html\" href=\"http://go"},
{"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/rss2full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><rss xmlns:creativeCommons=\"http://backend.userland.com/creativeCommonsRssModule\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\" version=\"2.0\"><channel><title>Google Operating System</title><link>http://googlesystem.blogspot.com/</link>"},
{"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"\"?>\n<!-- generator=\"wordpress/MU\" -->\n<rss version=\"2.0\"\n\txmlns:content=\"http://purl.org/rss/1.0/modules/content/\"\n\txmlns:wfw=\"http://wellformedweb.org/CommentAPI/\"\n\txmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n\t><channel>\n\t<title>Nunublog</title>\n\t<link>http://nunubh.wordpress.com</link>\n\t<description>Just Newbie Blog!</description>\n\t<pubDate>Mon, 09 Jul 2007 18:54:09 +0000</pubDate>\n\t<generator>http://wordpress.org/?v=MU</generator>\n\t<language>id</language>\n\t\t\t<item>\n\t\t<ti"},
{"type": "text/html", "input": "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<HEAD>\r\n<TITLE>Design*Sponge</TITLE><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"MSSmartTagsPreventParsing\" content=\"true\" />\r\n<meta name=\"generator\" content=\"Blogger\" />\r\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"Design*Sponge - Atom\" href=\"http://designsponge.blogspot.com/feeds/posts/default\" />\r\n<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Design*Sponge - RSS\" href="},
{"type": "text/html", "input": "<HTML>\n<HEAD>\n<TITLE>Moved Temporarily</TITLE>\n</HEAD>\n<BODY BGCOLOR=\"#FFFFFF\" TEXT=\"#000000\">\n<H1>Moved Temporarily</H1>\nThe document has moved <A HREF=\"http://feeds.feedburner.com/thesecretdiaryofstevejobs\">here</A>.\n</BODY>\n</HTML>\n"}
]

View file

@ -11,12 +11,24 @@
"input":"foo</bar>",
"output":[["Character", "foo"], ["EndTag", "bar"]]},
{"description":"End tag closing RCDATA or CDATA (case-insensitivity)",
"contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"bar",
"input":"foo</bAr>",
"output":[["Character", "foo"], ["EndTag", "bar"]]},
{"description":"End tag with incorrect name in RCDATA or CDATA",
"contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"baz",
"input":"</foo>bar</baz>",
"output":[["Character", "</foo>bar"], ["EndTag", "baz"]]},
{"description":"End tag with incorrect name in RCDATA or CDATA (starting like correct name)",
"contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"baz",
"input":"</foo>bar</bazaar>",
"output":[["Character", "</foo>bar</bazaar>"]]},
{"description":"End tag closing RCDATA or CDATA, switching back to PCDATA",
"contentModelFlags":["RCDATA", "CDATA"],
"lastStartTag":"bar",

File diff suppressed because it is too large Load diff

View file

@ -135,7 +135,7 @@
{"description":"Entity without trailing semicolon (2)",
"input":"I'm &notin",
"output":[["Character","I'm "], "ParseError", ["Character", ""]]},
"output":[["Character","I'm "], "ParseError", ["Character", "¬in"]]},
{"description":"Partial entity match at end of file",
"input":"I'm &no",
@ -151,6 +151,22 @@
{"description":"Hexadecimal entity in attribute",
"input":"<h a='&#x3f;'></h>",
"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]}
"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]},
{"description":"Entity in attribute without semicolon ending in x",
"input":"<h a='&notx'>",
"output":["ParseError", ["StartTag", "h", {"a":"&notx"}]]},
{"description":"Entity in attribute without semicolon ending in 1",
"input":"<h a='&not1'>",
"output":["ParseError", ["StartTag", "h", {"a":"&not1"}]]},
{"description":"Entity in attribute without semicolon ending in i",
"input":"<h a='&noti'>",
"output":["ParseError", ["StartTag", "h", {"a":"&noti"}]]},
{"description":"Entity in attribute without semicolon",
"input":"<h a='&COPY'>",
"output":["ParseError", ["StartTag", "h", {"a":"©"}]]}
]}

View file

@ -42,27 +42,23 @@
{"description":"Numeric entity representing the NUL character",
"input":"&#0000;",
"output":[["Character", "\uFFFD"]]},
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity representing the NUL character",
"input":"&#x0000;",
"output":[["Character", "\uFFFD"]]},
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#2225222;",
"output":[["Character", "\uFFFD"]]},
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#x1010FFFF;",
"output":[["Character", "\uFFFD"]]},
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Numeric entity representing a Windows-1252 'codepoint'",
"input":"&#137;",
"output":["ParseError", ["Character", "\u2030"]]},
{"description":"Hexadecimal entity representing a Windows-1252 'codepoint'",
"input":"&#x89;",
"output":["ParseError", ["Character", "\u2030"]]},
{"description":"Hexadecimal entity pair representing a surrogate pair",
"input":"&#xD869;&#xDED6;",
"output":["ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity with mixed uppercase and lowercase",
"input":"&#xaBcD;",
@ -118,7 +114,15 @@
{"description":"Null Byte Replacement",
"input":"\u0000",
"output":[["Character", "\ufffd"]]}
"output":["ParseError", ["Character", "\ufffd"]]},
{"description":"Comment with dash",
"input":"<!---x",
"output":["ParseError", ["Comment", "-x"]]},
{"description":"Entity + newline",
"input":"\nx\n&gt;\n",
"output":[["Character","\nx\n>\n"]]}
]}

View file

@ -0,0 +1,367 @@
{"tests": [
{"description":"<",
"input":"<",
"output":["ParseError", ["Character", "<"]]},
{"description":"<>",
"input":"<>",
"output":["ParseError", ["Character", "<>"]]},
{"description":"<!",
"input":"<!",
"output":["ParseError", ["Comment", ""]]},
{"description":"<!>",
"input":"<!>",
"output":["ParseError", ["Comment", ""]]},
{"description":"<!--",
"input":"<!--",
"output":["ParseError", ["Comment", ""]]},
{"description":"<!-->",
"input":"<!-->",
"output":["ParseError", ["Comment", ""]]},
{"description":"<!---",
"input":"<!---",
"output":["ParseError", ["Comment", ""]]},
{"description":"<!--->",
"input":"<!--->",
"output":["ParseError", ["Comment", ""]]},
{"description":"<!---->",
"input":"<!---->",
"output":[["Comment", ""]]},
{"description":"<!-----",
"input":"<!-----",
"output":["ParseError", "ParseError", ["Comment", "-"]]},
{"description":"<!----.",
"input":"<!----.",
"output":["ParseError", "ParseError", ["Comment", "--."]]},
{"description":"<!---?",
"input":"<!---?",
"output":["ParseError", ["Comment", "-?"]]},
{"description":"<!--?-",
"input":"<!--?-",
"output":["ParseError", ["Comment", "?"]]},
{"description":"<!--?--",
"input":"<!--?--",
"output":["ParseError", ["Comment", "?"]]},
{"description":"<!--?-.",
"input":"<!--?-.",
"output":["ParseError", ["Comment", "?-."]]},
{"description":"<!--?.",
"input":"<!--?.",
"output":["ParseError", ["Comment", "?."]]},
{"description":"<?>",
"input":"<?>",
"output":["ParseError", ["Comment", "?"]]},
{"description":"<??",
"input":"<??",
"output":["ParseError", ["Comment", "??"]]},
{"description":"</",
"input":"</",
"output":["ParseError", ["Character", "</"]]},
{"description":"</>",
"input":"</>",
"output":["ParseError"]},
{"description":"</?",
"input":"</?",
"output":["ParseError", ["Comment", "?"]]},
{"description":">",
"input":">",
"output":[["Character", ">"]]},
{"description":"-",
"input":"-",
"output":[["Character", "-"]]},
{"description":"?",
"input":"?",
"output":[["Character", "?"]]},
{"description":"&",
"input":"&",
"output":[["Character", "&"]]},
{"description":"&#",
"input":"&#",
"output":["ParseError", ["Character", "&#"]]},
{"description":"&#9",
"input":"&#9",
"output":["ParseError", ["Character", "\t"]]},
{"description":"<!doctype >",
"input":"<!doctype >",
"output":["ParseError", ["DOCTYPE", "", null, null, false]]},
{"description":"<!doctype ",
"input":"<!doctype ",
"output":["ParseError", ["DOCTYPE", "", null, null, false]]},
{"description":"<!doctype!>",
"input":"<!doctype!>",
"output":["ParseError", ["DOCTYPE", "!", null, null, true]]},
{"description":"<!doctype! >",
"input":"<!doctype! >",
"output":["ParseError", ["DOCTYPE", "!", null, null, true]]},
{"description":"<!doctype! ",
"input":"<!doctype! ",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! ?>",
"input":"<!doctype! ?>",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! ??",
"input":"<!doctype! ??",
"output":["ParseError", "ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype!?",
"input":"<!doctype!?",
"output":["ParseError", "ParseError", ["DOCTYPE", "!?", null, null, false]]},
{"description":"<!doctype! public>",
"input":"<!doctype! public>",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! public ",
"input":"<!doctype! public ",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! public?",
"input":"<!doctype! public?",
"output":["ParseError", "ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! public''",
"input":"<!doctype! public''",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", "", null, false]]},
{"description":"<!doctype! public'(",
"input":"<!doctype! public'(",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", "(", null, false]]},
{"description":"<!doctype! public\"\">",
"input":"<!doctype! public\"\">",
"output":["ParseError", ["DOCTYPE", "!", "", null, true]]},
{"description":"<!doctype! public\"\" ",
"input":"<!doctype! public\"\" ",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", "", null, false]]},
{"description":"<!doctype! public\"\"?",
"input":"<!doctype! public\"\"?",
"output":["ParseError", "ParseError", "ParseError", ["DOCTYPE", "!", "", null, false]]},
{"description":"<!doctype! public\"\"'",
"input":"<!doctype! public\"\"'",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", "", "", false]]},
{"description":"<!doctype! public\"\"\"",
"input":"<!doctype! public\"\"\"",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", "", "", false]]},
{"description":"<!doctype! public\"#",
"input":"<!doctype! public\"#",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", "#", null, false]]},
{"description":"<!doctype! system>",
"input":"<!doctype! system>",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! system ",
"input":"<!doctype! system ",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! system?",
"input":"<!doctype! system?",
"output":["ParseError", "ParseError", "ParseError", ["DOCTYPE", "!", null, null, false]]},
{"description":"<!doctype! system''",
"input":"<!doctype! system''",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, "", false]]},
{"description":"<!doctype! system'(",
"input":"<!doctype! system'(",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, "(", false]]},
{"description":"<!doctype! system\"\">",
"input":"<!doctype! system\"\">",
"output":["ParseError", ["DOCTYPE", "!", null, "", true]]},
{"description":"<!doctype! system\"\" ",
"input":"<!doctype! system\"\" ",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, "", false]]},
{"description":"<!doctype! system\"\"?",
"input":"<!doctype! system\"\"?",
"output":["ParseError", "ParseError", "ParseError", ["DOCTYPE", "!", null, "", false]]},
{"description":"<!doctype! system\"#",
"input":"<!doctype! system\"#",
"output":["ParseError", "ParseError", ["DOCTYPE", "!", null, "#", false]]},
{"description":"</z",
"input":"</z",
"output":["ParseError", ["EndTag", "z"]]},
{"description":"<z>",
"input":"<z>",
"output":[["StartTag", "z", {}]]},
{"description":"<z ",
"input":"<z ",
"output":["ParseError", ["StartTag", "z", {}]]},
{"description":"<z/>",
"input":"<z/>",
"output":["ParseError", ["StartTag", "z", {}]]},
{"description":"<z/ ",
"input":"<z/ ",
"output":["ParseError", "ParseError", ["StartTag", "z", {}]]},
{"description":"<z//",
"input":"<z//",
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {}]]},
{"description":"<z",
"input":"<z",
"output":["ParseError", ["StartTag", "z", {}]]},
{"description":"</z",
"input":"</z",
"output":["ParseError", ["EndTag", "z"]]},
{"description":"<z0",
"input":"<z0",
"output":["ParseError", ["StartTag", "z0", {}]]},
{"description":"<z/0=>",
"input":"<z/0=>",
"output":["ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0= ",
"input":"<z/0= ",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0=?>",
"input":"<z/0=?>",
"output":["ParseError", ["StartTag", "z", {"0": "?"}]]},
{"description":"<z/0=? ",
"input":"<z/0=? ",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "?"}]]},
{"description":"<z/0=??",
"input":"<z/0=??",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "??"}]]},
{"description":"<z/0=''",
"input":"<z/0=''",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0='&",
"input":"<z/0='&",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "&"}]]},
{"description":"<z/0='%",
"input":"<z/0='%",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "%"}]]},
{"description":"<z/0=\"'",
"input":"<z/0=\"'",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "'"}]]},
{"description":"<z/0=\"\"",
"input":"<z/0=\"\"",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0=\"&",
"input":"<z/0=\"&",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "&"}]]},
{"description":"<z/0=&",
"input":"<z/0=&",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "&"}]]},
{"description":"<z/0>",
"input":"<z/0>",
"output":["ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0 =",
"input":"<z/0 =",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0 >",
"input":"<z/0 >",
"output":["ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0 ",
"input":"<z/0 ",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0 /",
"input":"<z/0 /",
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0/",
"input":"<z/0/",
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/00",
"input":"<z/00",
"output":["ParseError", "ParseError", ["StartTag", "z", {"00": ""}]]},
{"description":"<z/0 0",
"input":"<z/0 0",
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"0": ""}]]},
{"description":"<z/0='&#9",
"input":"<z/0='&#9",
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"0": "\t"}]]},
{"description":"<z/0=\"&#9",
"input":"<z/0=\"&#9",
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"0": "\t"}]]},
{"description":"<z/0=&#9",
"input":"<z/0=&#9",
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"0": "\t"}]]},
{"description":"<z/0z",
"input":"<z/0z",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0z": ""}]]},
{"description":"<z/0 z",
"input":"<z/0 z",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "", "z": ""}]]},
{"description":"<zz",
"input":"<zz",
"output":["ParseError", ["StartTag", "zz", {}]]},
{"description":"<z/z",
"input":"<z/z",
"output":["ParseError", "ParseError", ["StartTag", "z", {"z": ""}]]}
]}

View file

@ -0,0 +1,198 @@
{"tests": [
{"description":"< in attribute name",
"input":"<z/0 <",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "", "<": ""}]]},
{"description":"< in attribute value",
"input":"<z x=<",
"output":["ParseError", ["StartTag", "z", {"x": "<"}]]},
{"description":"CR EOF after doctype name",
"input":"<!doctype html \r",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"CR EOF in tag name",
"input":"<z\r",
"output":["ParseError", ["StartTag", "z", {}]]},
{"description":"Zero hex numeric entity",
"input":"&#x0",
"output":["ParseError", "ParseError", ["Character", "\uFFFD"]]},
{"description":"Zero decimal numeric entity",
"input":"&#0",
"output":["ParseError", "ParseError", ["Character", "\uFFFD"]]},
{"description":"Zero-prefixed hex numeric entity",
"input":"&#x000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000041;",
"output":[["Character", "A"]]},
{"description":"Zero-prefixed decimal numeric entity",
"input":"&#000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000065;",
"output":[["Character", "A"]]},
{"description":"Empty hex numeric entities",
"input":"&#x &#X ",
"output":["ParseError", ["Character", "&#x "], "ParseError", ["Character", "&#X "]]},
{"description":"Empty decimal numeric entities",
"input":"&# &#; ",
"output":["ParseError", ["Character", "&# "], "ParseError", ["Character", "&#; "]]},
{"description":"Non-BMP numeric entity",
"input":"&#x10000;",
"output":[["Character", "\uD800\uDC00"]]},
{"description":"Maximum non-BMP numeric entity",
"input":"&#X10FFFF;",
"output":[["Character", "\uDBFF\uDFFF"]]},
{"description":"Above maximum numeric entity",
"input":"&#x110000;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"32-bit hex numeric entity",
"input":"&#x80000041;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"33-bit hex numeric entity",
"input":"&#x100000041;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"33-bit decimal numeric entity",
"input":"&#4294967361;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"65-bit hex numeric entity",
"input":"&#x10000000000000041;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"65-bit decimal numeric entity",
"input":"&#18446744073709551681;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Surrogate code point edge cases",
"input":"&#xD7FF;&#xD800;&#xD801;&#xDFFE;&#xDFFF;&#xE000;",
"output":[["Character", "\uD7FF"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD\uE000"]]},
{"description":"Uppercase start tag name",
"input":"<X>",
"output":[["StartTag", "x", {}]]},
{"description":"Uppercase end tag name",
"input":"</X>",
"output":[["EndTag", "x"]]},
{"description":"Uppercase attribute name",
"input":"<x X>",
"output":[["StartTag", "x", { "x":"" }]]},
{"description":"Tag/attribute name case edge values",
"input":"<x@AZ[`az{ @AZ[`az{>",
"output":[["StartTag", "x@az[`az{", { "@az[`az{":"" }]]},
{"description":"Duplicate different-case attributes",
"input":"<x x=1 x=2 X=3>",
"output":["ParseError", "ParseError", ["StartTag", "x", { "x":"1" }]]},
{"description":"Uppercase close tag attributes",
"input":"</x X>",
"output":["ParseError", ["EndTag", "x"]]},
{"description":"Duplicate close tag attributes",
"input":"</x x x>",
"output":["ParseError", "ParseError", ["EndTag", "x"]]},
{"description":"Permitted slash",
"input":"<br/>",
"output":[["StartTag", "br", {}]]},
{"description":"Non-permitted slash",
"input":"<xr/>",
"output":["ParseError", ["StartTag", "xr", {}]]},
{"description":"Permitted slash but in close tag",
"input":"</br/>",
"output":["ParseError", ["EndTag", "br"]]},
{"description":"Doctype public case-sensitivity (1)",
"input":"<!DoCtYpE HtMl PuBlIc \"AbC\" \"XyZ\">",
"output":[["DOCTYPE", "HtMl", "AbC", "XyZ", true]]},
{"description":"Doctype public case-sensitivity (2)",
"input":"<!dOcTyPe hTmL pUbLiC \"aBc\" \"xYz\">",
"output":[["DOCTYPE", "hTmL", "aBc", "xYz", true]]},
{"description":"Doctype system case-sensitivity (1)",
"input":"<!DoCtYpE HtMl SyStEm \"XyZ\">",
"output":[["DOCTYPE", "HtMl", null, "XyZ", true]]},
{"description":"Doctype system case-sensitivity (2)",
"input":"<!dOcTyPe hTmL sYsTeM \"xYz\">",
"output":[["DOCTYPE", "hTmL", null, "xYz", true]]},
{"description":"U+0000 in lookahead region after non-matching character",
"input":"<!doc>\u0000",
"output":["ParseError", ["Comment", "doc"], "ParseError", ["Character", "\uFFFD"]],
"ignoreErrorOrder":true},
{"description":"U+0000 in lookahead region",
"input":"<!doc\u0000",
"output":["ParseError", "ParseError", ["Comment", "doc\uFFFD"]],
"ignoreErrorOrder":true},
{"description":"CR followed by U+0000",
"input":"\r\u0000",
"output":["ParseError", ["Character", "\n\uFFFD"]],
"ignoreErrorOrder":true},
{"description":"CR followed by non-LF",
"input":"\r?",
"output":[["Character", "\n?"]]},
{"description":"CR at EOF",
"input":"\r",
"output":[["Character", "\n"]]},
{"description":"LF at EOF",
"input":"\n",
"output":[["Character", "\n"]]},
{"description":"CR LF",
"input":"\r\n",
"output":[["Character", "\n"]]},
{"description":"CR CR",
"input":"\r\r",
"output":[["Character", "\n\n"]]},
{"description":"LF LF",
"input":"\n\n",
"output":[["Character", "\n\n"]]},
{"description":"LF CR",
"input":"\n\r",
"output":[["Character", "\n\n"]]},
{"description":"text CR CR CR text",
"input":"text\r\r\rtext",
"output":[["Character", "text\n\n\ntext"]]},
{"description":"Doctype publik",
"input":"<!DOCTYPE html PUBLIK \"AbC\" \"XyZ\">",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"Doctype publi",
"input":"<!DOCTYPE html PUBLI",
"output":["ParseError", "ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"Doctype sistem",
"input":"<!DOCTYPE html SISTEM \"AbC\">",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"Doctype sys",
"input":"<!DOCTYPE html SYS",
"output":["ParseError", "ParseError", ["DOCTYPE", "html", null, null, false]]}
]}

View file

@ -113,7 +113,6 @@ Line1<br>Line2<br>Line3<br>Line4
<html><head></body></html>
#errors
6: missing document type declaration
19: unexpected body element end tag in head
#document
| <html>
| <head>
@ -159,7 +158,6 @@ Line1<br>Line2<br>Line3<br>Line4
</head>
#errors
7: missing document type declaration
7: unexpected head element end tag
#document
| <html>
| <head>
@ -169,7 +167,6 @@ Line1<br>Line2<br>Line3<br>Line4
</body>
#errors
7: missing document type declaration
7: unexpected body element end tag
#document
| <html>
| <head>
@ -285,6 +282,7 @@ Line1<br>Line2<br>Line3<br>Line4
| <div>
| <b>
| <marquee>
| <p>
| "X"
#data
@ -330,6 +328,7 @@ Unexpected end of file
| <body>
| <p>
| <hr>
| <p>
#data
<select><b><option><select><option></b></select>X
@ -435,6 +434,7 @@ Unexpected end of file
#data
<!DOCTYPE HTML><li>hello<li>world<ul>how<li>do</ul>you</body><!--do-->
#errors
Unexpected end of file. Expected </li>. XXX
#document
| <!DOCTYPE HTML>
| <html>
@ -636,7 +636,6 @@ Unexpected end of file
#data
<!DOCTYPE HTML><script> <!-- </script> --> </script> EOF
#errors
52: unexpected script element end tag
#document
| <!DOCTYPE HTML>
| <html>
@ -730,6 +729,7 @@ Unexpected end of file
#errors
6: missing document type declaration
29: mismatched font element end tag (misnested tags)
AAA </font> tag strikes again
35: mismatched body element end tag (premature end of file?)
#document
| <html>
@ -1120,6 +1120,7 @@ Unexpected end of file
15: missing document type declaration
39: unexpected node in table context
39: a element start tag implying a element end tag
AAA violation: </a>
39: unexpected node in table context
39: mismatched a element end tag (misnested tags across <table> tag)
43: unexpected node in table context
@ -1175,6 +1176,8 @@ Unexpected end of file
7: missing document type declaration
22: unexpected node in table context
27: unexpected node in table context
XXX more table voodoo
XXX more table voodoo
54: unexpected td element end tag implied other end tags
63: unexpected node in table context
72: mismatched body element end tag (premature end of file?)
@ -1299,11 +1302,9 @@ unexpected EOF
#errors
6: missing document type declaration
12: unexpected body element start tag
18: base element start tag out of place
24: link element start tag out of place
30: meta element start tag out of place
37: title element start tag out of place
54: unexpected body element start tag
Missing end tag </p>. XXX
#document
| <html>
| <head>
@ -1344,7 +1345,6 @@ unexpected EOF
3: missing document type declaration
13: unexpected node in table context
13: a element start tag implying a element end tag
13: unexpected node in table context
13: mismatched a element end tag (misnested tags across <table> tag)
21: mismatched table element end tag
27: a element start tag implying a element end tag
@ -1369,13 +1369,14 @@ unexpected EOF
<head></p><meta><p>
#errors
6: missing document type declaration
10: unexpected p element end tag in head
10: unexpected p element end tag
#document
| <html>
| <head>
| <meta>
| <body>
| <p>
| <meta>
| <p>
#data
<head></html><meta><p>
@ -1485,6 +1486,7 @@ unexpected EOF
| <div>
| <b>
| <marquee>
| <p>
#data
<script></script></div><title></title><p><p>
@ -1511,6 +1513,7 @@ unexpected EOF
| <body>
| <p>
| <hr>
| <p>
#data
<select><b><option><select><option></b></select>
@ -1571,6 +1574,8 @@ unexpected EOF
<ul><li></li><div><li></div><li><li><div><li><address><li><b><em></b><li></ul>
#errors
4: missing document type declaration
Missing end tag for <div> (nr2)
Missing end tag for <address>
69: mismatched b element end tag (misnested tags)
#document
| <html>
@ -1615,7 +1620,6 @@ unexpected EOF
56: unexpected frameset element start tag in body
63: unexpected frame element start tag in body
74: unexpected frameset element end tag
87: unescaped '</' in CDATA or RCDATA block
106: unexpected end of file while parsing CDATA section for element noframes
#document
| <html>
@ -1630,6 +1634,7 @@ unexpected EOF
4: missing document type declaration
15: required tr element start tag implied by unexpected td element start tag
27: unexpected td element end tag implied other end tags
Unexpected </h1> tag. Expected other.
Unexpected EOF
#document
| <html>
@ -1737,9 +1742,9 @@ Unexpected EOF
108: unexpected h4 element end tag
113: unexpected h5 element end tag
118: unexpected h6 element end tag
125: unexpected body element end tag
125: unexpected end tag token br in after body phase
130: unexpected br element end tag
134: unexpected a element end tag
134: unexpected a element end tag (AAA)
140: unexpected img element end tag
148: unexpected title element end tag
155: unexpected span element end tag
@ -1807,6 +1812,7 @@ Unexpected EOF
| <head>
| <body>
| <br>
| <p>
#data
<table><tr></strong></b></em></i></u></strike></s></blink></tt></pre></big></small></font></select></h1></h2></h3></h4></h5></h6></body></br></a></img></title></span></style></script></table></th></td></tr></frame></area></link></param></hr></input></col></base></meta></basefont></bgsound></embed></spacer></p></dd></dt></caption></colgroup></tbody></tfoot></thead></address></blockquote></center></dir></div></dl></fieldset></listing></menu></ol></ul></li></nobr></wbr></form></button></marquee></object></html></frameset></head></iframe></image></isindex></noembed></noframes></noscript></optgroup></option></plaintext></textarea>
@ -1920,6 +1926,9 @@ Unexpected EOF
610: unexpected option element end tag
622: unexpected plaintext element end tag
633: mismatched special end tag textarea
XXX
XXX
XXX
#document
| <html>
| <head>
@ -1928,3 +1937,14 @@ Unexpected EOF
| <table>
| <tbody>
| <tr>
| <p>
#data
<frameset>
#errors
10: Start tag seen without seeing a doctype first.
11: End of file seen and there were open elements.
#document
| <html>
| <head>
| <frameset>

View file

@ -12,7 +12,6 @@
<textarea>test</div>test
#errors
10: missing document type declaration.
17: unescaped '</' in CDATA or RCDATA block.
25: unexpected end of file while parsing CDATA section for element textarea.
#document
| <html>
@ -87,6 +86,8 @@ Expected end tag </frameset>
#data
<!DOCTYPE HTML><font><p><b>test</font>
#errors
AAA violation. </font>
AAA violation. </font>
#document
| <!DOCTYPE HTML>
| <html>
@ -101,6 +102,7 @@ Expected end tag </frameset>
#data
<!DOCTYPE HTML><dt><div><dd>
#errors
Missing end tag for <div>.
#document
| <!DOCTYPE HTML>
| <html>
@ -114,7 +116,6 @@ Expected end tag </frameset>
<script></x
#errors
no document type
</ in script
Unexpected end of file. Expected </script> end tag.
#document
| <html>
@ -129,6 +130,7 @@ Unexpected end of file. Expected </script> end tag.
no document type
<plaintext> directly inside table
Characters inside table.
Characters inside table. (XXX?)
Unexpected end of file.
#document
| <html>
@ -175,10 +177,10 @@ Unexpected start tag "body"
| <html>
| <head>
| <body>
| t4="4"
| t1="1"
| t2="2"
| t3="3"
| t1="1"
| t4="4"
#data
</b test
@ -195,7 +197,6 @@ Unexpected end tag.
#data
<!DOCTYPE HTML></b test<b &=&amp>X
#errors
Unexpected < in attribute
End tag contains attributes.
Unexpected end tag.
Named entity didn't end with ;
@ -224,7 +225,6 @@ Unexpected EOF in (end) tag name
&
#errors
No doctype.
Unfinished entity.
#document
| <html>
| <head>
@ -349,11 +349,11 @@ Unexpected end EOF. Missing closing tags.
| <b>
| <i>
| <u>
| " "
| <p>
| <b>
| <i>
| <u>
| <b>
| <i>
| <u>
| " "
| <p>
| "X"
#data
@ -538,10 +538,10 @@ No doctype
| <hr>
| <p>
| <label>
| "This is a searchable index. Insert your search keywords here:"
| "This is a searchable index. Insert your search keywords here: "
| <input>
| test="x"
| name="isindex"
| test="x"
| <hr>
#data
@ -571,19 +571,18 @@ Unexpected EOF.
| <b>
| <i>
| <u>
| "
| <b>
| <i>
| <u>
| "
"
| <p>
| <b>
| <i>
| <u>
| <p>
| "X"
#data
<!DOCTYPE HTML><body><title>test</body></title>
#errors
Unexpected start tag that belongs in the head.
Expected closing tag after </.
#document
| <!DOCTYPE HTML>
| <html>
@ -596,10 +595,7 @@ Expected closing tag after </.
<!DOCTYPE HTML><body><title>X</title><meta name=z><link rel=foo><style>
x { content:"</style" } </style>
#errors
Unexpected start tag that belongs in head.
Unexpected start tag that belongs in head.
Unexpected start tag that belongs in head.
Expected closing tag after </.
Unexpected start tag that belongs in head. <title>
#document
| <!DOCTYPE HTML>
| <html>
@ -632,8 +628,6 @@ x { content:"</style" } "
#errors
No doctype.
#document
| "
"
| <html>
| <head>
| <body>
@ -643,7 +637,6 @@ No doctype.
#errors
#document
| <!DOCTYPE HTML>
| " "
| <html>
| <head>
| <body>
@ -749,8 +742,8 @@ Solidus (/) incorrectly placed.
| <body>
| "X"
| <p>
| y=""
| x=""
| y=""
| z=""
#data
@ -777,3 +770,4 @@ Unexpected </p> end tag.
| <tbody>
| <tr>
| <td>
| <p>

View file

@ -61,7 +61,6 @@ No DOCTYPE
#data
<!DOCTYPE htML><html><head></head><body><pre>
foo</pre></body></html>
#errors
#document
@ -72,10 +71,22 @@ foo</pre></body></html>
| <pre>
| "foo"
#data
<!DOCTYPE htML><html><head></head><body><pre>
foo</pre></body></html>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <pre>
| "
foo"
#data
<!DOCTYPE htML><html><head></head><body><pre>
foo
</pre></body></html>
#errors
@ -120,6 +131,7 @@ y"
<!DOCTYPE htML><html><head></head><body><pre>x<div>
y</pre></body></html>
#errors
End tag <pre> seen too early. Expected other end tag.
#document
| <!DOCTYPE htML>
| <html>
@ -129,11 +141,12 @@ y</pre></body></html>
| "x"
| <div>
| "
| y"
y"
#data
<!DOCTYPE htML><HTML><META><HEAD></HEAD></HTML>
#errors
Unexpected start tag HEAD in HEAD. Ignored.
#document
| <!DOCTYPE htML>
| <html>
@ -144,6 +157,7 @@ y</pre></body></html>
#data
<!DOCTYPE htML><HTML><HEAD><head></HEAD></HTML>
#errors
Unexpected start tag HEAD in HEAD. Ignored.
#document
| <!DOCTYPE htML>
| <html>
@ -153,6 +167,8 @@ y</pre></body></html>
#data
<textarea>foo<span>bar</span><i>baz
#errors
Unexpected start tag. Expected DOCTYPE.
Unexpected end of file.
#document
| <html>
| <head>
@ -163,6 +179,8 @@ y</pre></body></html>
#data
<title>foo<span>bar</em><i>baz
#errors
Unexpected start tag. Expected DOCTYPE.
Unexpected end of file.
#document
| <html>
| <head>
@ -183,7 +201,6 @@ y</pre></body></html>
#data
<!DOCTYPE htML><textarea>
foo</textarea>
#errors
#document
@ -194,6 +211,20 @@ foo</textarea>
| <textarea>
| "foo"
#data
<!DOCTYPE htML><textarea>
foo</textarea>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <textarea>
| "
foo"
#data
<!DOCTYPE htML><html><head></head><body><ul><li><div><p><li></ul></body></html>
#errors
@ -212,6 +243,8 @@ Missing end tag (div)
#data
<!doctype html><nobr><nobr><nobr>
#errors
Unexpected <nobr> tag.
Unexpected <nobr> tag.
Unexpected end of file.
#document
| <!DOCTYPE html>
@ -225,6 +258,7 @@ Unexpected end of file.
#data
<!doctype html><nobr><nobr></nobr><nobr>
#errors
Unexpected <nobr> tag.
Unexpected end of file.
#document
| <!DOCTYPE html>

View file

@ -1,37 +1,50 @@
#data
direct div content
#errors
#document-fragment div
#document-fragment
div
#document
| "direct div content"
#data
direct textarea content
#errors
#document-fragment textarea
#document-fragment
textarea
#document
| "direct textarea content"
#data
textarea content with <em>pseudo</em> <foo>markup
#errors
#document-fragment textarea
#document-fragment
textarea
#document
| "textarea content with <em>pseudo</em> <foo>markup"
#data
this is &#x0043;DATA inside a <style> element
#errors
#document-fragment style
#document-fragment
style
#document
| "this is &#x0043;DATA inside a <style> element"
#data
</plaintext>
#errors
#document-fragment plaintext
#document-fragment
plaintext
#document
| "</plaintext>"
#data
setting html's innerHTML
#errors
#document-fragment html
XXX innerHTML EOF
#document-fragment
html
#document
| <head>
| <body>
| "setting html's innerHTML"
@ -39,6 +52,9 @@ setting html's innerHTML
#data
<title>setting head's innerHTML</title>
#errors
#document-fragment head
Unexpected title element that belongs in head.
#document-fragment
head
#document
| <title>
| "setting head's innerHTML"

View file

@ -110,7 +110,6 @@ No DOCTYPE
<style> <!</-- </style>x
#errors
No DOCTYPE
Unexpected end of file
#document
| <html>
| <head>
@ -118,3 +117,59 @@ Unexpected end of file
| " <!</-- "
| <body>
| "x"
#data
<xmp> <!-- > --> </xmp>
#errors
No DOCTYPE
#document
| <html>
| <head>
| <body>
| <xmp>
| " <!-- > --> "
#data
<title>&amp;</title>
#errors
No DOCTYPE
#document
| <html>
| <head>
| <title>
| "&"
| <body>
#data
<title><!--&amp;--></title>
#errors
No DOCTYPE
#document
| <html>
| <head>
| <title>
| "<!--&amp;-->"
| <body>
#data
<title><!--</title>
#errors
No DOCTYPE
Unexpected EOF
#document
| <html>
| <head>
| <title>
| "<!--</title>"
| <body>
#data
<noscript><!--</noscript>--></noscript>
#errors
No DOCTYPE
#document
| <html>
| <head>
| <noscript>
| "<!--</noscript>-->"
| <body>

View file

@ -1,6 +1,7 @@
#data
<!doctype html></head> <head>
#errors
Unexpected start tag head. Ignored.
#document
| <!DOCTYPE html>
| <html>
@ -11,6 +12,9 @@
#data
<!doctype html></html> <head>
#errors
Unexpected start tag head.
Unexpected start tag head in after body phase.
Unexpected start tag head. Ignored.
#document
| <!DOCTYPE html>
| <html>
@ -21,9 +25,69 @@
#data
<!doctype html></body><meta>
#errors
Unexpected meta element in after body phase.
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| <meta>
#data
<!doctype HTml><form><div></form><div>
#errors
Form end tag ignored.
Unexpected end of file.
#document
| <!DOCTYPE HTml>
| <html>
| <head>
| <body>
| <form>
| <div>
| <div>
#data
<!doctype HTml><title>&amp;</title>
#errors
#document
| <!DOCTYPE HTml>
| <html>
| <head>
| <title>
| "&"
| <body>
#data
<!doctype HTml><title><!--&amp;--></title>
#errors
#document
| <!DOCTYPE HTml>
| <html>
| <head>
| <title>
| "<!--&amp;-->"
| <body>
#data
<!doctype>
#errors
No space after "doctype"
Unexpected ">"
Incorrect doctype
#document
| <!DOCTYPE >
| <html>
| <head>
| <body>
#data
<!---x
#errors
End of file in comment
End of file before doctype
#document
| <!-- -x -->
| <html>
| <head>
| <body>

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,159 @@
{"tests": [
{"description": "valid single class attribute value",
"input": "<span class=a>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading space",
"input": "<span class=' a'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with trailing space",
"input": "<span class='a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading and trailing space",
"input": "<span class=' a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading tab",
"input": "<span class=' a'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with trailing tab",
"input": "<span class='a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading and trailing tab",
"input": "<span class=' a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading LF",
"input": "<span class='
a'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with trailing LF",
"input": "<span class='a
'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading and trailing LF",
"input": "<span class='
a
'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading LT",
"input": "<span class=' a'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with trailing LT",
"input": "<span class='a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading and trailing LT",
"input": "<span class=' a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading FF",
"input": "<span class=' a'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with trailing FF",
"input": "<span class='a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading and trailing FF",
"input": "<span class=' a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading CR",
"input": "<span class=' a'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with trailing CR",
"input": "<span class='a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid single class attribute value with leading and trailing CR",
"input": "<span class=' a '>",
"fail-if": "invalid-attribute-value"},
{"description": "valid double class attribute value separated by space",
"input": "<span class='a b'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid double class attribute value separated by tab",
"input": "<span class='a b'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid double class attribute value separated by LF",
"input": "<span class='a
b'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid double class attribute value separated by LT",
"input": "<span class='a b'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid double class attribute value separated by FF",
"input": "<span class='a b'>",
"fail-if": "invalid-attribute-value"},
{"description": "valid double class attribute value separated by CR",
"input": "<span class='a b'>",
"fail-if": "invalid-attribute-value"},
{"description": "invalid duplicated class attribute value separated by space",
"input": "<span class='a a'>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid duplicated class attribute value separated by tab",
"input": "<span class='a a'>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid duplicated class attribute value separated by LF",
"input": "<span class='a
a'>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid duplicated class attribute value separated by LT",
"input": "<span class='a a'>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid duplicated class attribute value separated by FF",
"input": "<span class='a a'>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid duplicated class attribute value separated by CR",
"input": "<span class='a a'>",
"fail-unless": "duplicate-value-in-token-list"},
{"description": "invalid duplicated class attribute value separated by space",
"input": "<span class='a a'>",
"fail-unless": "duplicate-value-in-token-list"},
{"description": "invalid duplicated class attribute value separated by tab",
"input": "<span class='a a'>",
"fail-unless": "duplicate-value-in-token-list"},
{"description": "invalid duplicated class attribute value separated by LF",
"input": "<span class='a
a'>",
"fail-unless": "duplicate-value-in-token-list"},
{"description": "invalid duplicated class attribute value separated by LT",
"input": "<span class='a a'>",
"fail-unless": "duplicate-value-in-token-list"},
{"description": "invalid duplicated class attribute value separated by FF",
"input": "<span class='a a'>",
"fail-unless": "duplicate-value-in-token-list"},
{"description": "invalid duplicated class attribute value separated by CR",
"input": "<span class='a a'>",
"fail-unless": "duplicate-value-in-token-list"}
]}

View file

@ -0,0 +1,59 @@
{"tests": [
{"description": "valid contenteditable attribute value 'true'",
"input": "<span contenteditable=true>",
"fail-if": "invalid-attribute-value"},
{"description": "valid contenteditable attribute value 'TRUE'",
"input": "<span contenteditable=TRUE>",
"fail-if": "invalid-attribute-value"},
{"description": "valid contenteditable attribute value 'TrUe'",
"input": "<span contenteditable=TrUe>",
"fail-if": "invalid-attribute-value"},
{"description": "valid contenteditable attribute value 'false'",
"input": "<span contenteditable=false>",
"fail-if": "invalid-attribute-value"},
{"description": "valid contenteditable attribute value 'FALSE'",
"input": "<span contenteditable=FALSE>",
"fail-if": "invalid-attribute-value"},
{"description": "valid contenteditable attribute value 'FalSe'",
"input": "<span contenteditable=FalSe>",
"fail-if": "invalid-attribute-value"},
{"description": "valid contenteditable attribute value ''",
"input": "<span contenteditable=''>",
"fail-if": "invalid-attribute-value"},
{"description": "valid contenteditable attribute value (not specified)",
"input": "<span contenteditable>",
"fail-if": "invalid-attribute-value"},
{"description": "invalid contenteditable attribute value 'foo'",
"input": "<span contenteditable=foo>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid contenteditable attribute value '0'",
"input": "<span contenteditable=0>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid contenteditable attribute value '1'",
"input": "<span contenteditable=1>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid contenteditable attribute value 'yes'",
"input": "<span contenteditable=yes>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid contenteditable attribute value 'no'",
"input": "<span contenteditable=no>",
"fail-unless": "invalid-attribute-value"},
{"description": "invalid contenteditable attribute value 'inherit'",
"input": "<span contenteditable=inherit>",
"fail-unless": "invalid-attribute-value"}
]}

View file

@ -0,0 +1,118 @@
{"tests": [
{"description": "contextmenu points to valid ID earlier",
"input": "<menu id=a><span contextmenu=a>",
"fail-if": "id-does-not-exist"},
{"description": "contextmenu points to valid ID later",
"input": "<span contextmenu=a><menu id=a>",
"fail-if": "id-does-not-exist"},
{"description": "contextmenu points to non-existent ID",
"input": "<span contextmenu=a>",
"fail-unless": "id-does-not-exist"},
{"description": "contextmenu points to ID on non-menu element",
"input": "<span id=a><span contextmenu=a>",
"fail-unless": "contextmenu-must-point-to-menu"},
{"description": "uppercase contextmenu points to ID on non-menu element",
"input": "<span id=a><span CONTEXTMENU=a>",
"fail-unless": "contextmenu-must-point-to-menu"},
{"description": "valid ID 'a'",
"input": "<span contextmenu=a>",
"fail-if": "invalid-attribute-value"},
{"description": "valid ID '1'",
"input": "<span contextmenu=1>",
"fail-if": "invalid-attribute-value"},
{"description": "wacky but valid ID",
"input": "<span contextmenu='<html><head><title>a</title></head><body><p>b</p></body></html>'>",
"fail-if": "invalid-attribute-value"},
{"description": "invalid blank ID",
"input": "<span id>",
"fail-unless": "attribute-value-can-not-be-blank"},
{"description": "invalid blank ID with quotes",
"input": "<span contextmenu=''>",
"fail-unless": "attribute-value-can-not-be-blank"},
{"description": "invalid ID because of leading space",
"input": "<span contextmenu=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing space",
"input": "<span contextmenu='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of space in value",
"input": "<span contextmenu='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading tab",
"input": "<span contextmenu=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing tab",
"input": "<span contextmenu='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of tab in value",
"input": "<span contextmenu='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading LF",
"input": "<span contextmenu='
a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing LF",
"input": "<span contextmenu='a
'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of LF in value",
"input": "<span contextmenu='a
b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading LT",
"input": "<span contextmenu=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing LT",
"input": "<span contextmenu='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of LT in value",
"input": "<span contextmenu='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading FF",
"input": "<span contextmenu=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing FF",
"input": "<span contextmenu='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of FF in value",
"input": "<span contextmenu='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading CR",
"input": "<span contextmenu=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing CR",
"input": "<span contextmenu='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of CR in value",
"input": "<span contextmenu='a b'>",
"fail-unless": "space-in-id"}
]}

View file

@ -0,0 +1,118 @@
{"tests": [
{"description": "valid ID 'a'",
"input": "<span id=a>",
"fail-if": "invalid-attribute-value"},
{"description": "valid ID '1'",
"input": "<span id=1>",
"fail-if": "invalid-attribute-value"},
{"description": "wacky but valid ID",
"input": "<span id='<html><head><title>a</title></head><body><p>b</p></body></html>'>",
"fail-if": "invalid-attribute-value"},
{"description": "invalid blank ID",
"input": "<span id>",
"fail-unless": "attribute-value-can-not-be-blank"},
{"description": "invalid blank ID with quotes",
"input": "<span id=''>",
"fail-unless": "attribute-value-can-not-be-blank"},
{"description": "invalid ID because of leading space",
"input": "<span id=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing space",
"input": "<span id='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of space in value",
"input": "<span id='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading tab",
"input": "<span id=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing tab",
"input": "<span id='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of tab in value",
"input": "<span id='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading LF",
"input": "<span id='
a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing LF",
"input": "<span id='a
'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of LF in value",
"input": "<span id='a
b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading LT",
"input": "<span id=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing LT",
"input": "<span id='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of LT in value",
"input": "<span id='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading FF",
"input": "<span id=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing FF",
"input": "<span id='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of FF in value",
"input": "<span id='a b'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of leading CR",
"input": "<span id=' a'>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of trailing CR",
"input": "<span id='a '>",
"fail-unless": "space-in-id"},
{"description": "invalid ID because of CR in value",
"input": "<span id='a b'>",
"fail-unless": "space-in-id"},
{"description": "duplicate ID values",
"input": "<span id=a><span id=a>",
"fail-unless": "duplicate-id"},
{"description": "duplicate ID values with spaces (weird but true)",
"input": "<span id='a '><span id='a '>",
"fail-unless": "duplicate-id"},
{"description": "not duplicate ID values because spaces don't match",
"input": "<span id=a><span id='a '>",
"fail-if": "duplicate-id"},
{"description": "not duplicate ID values because spaces don't match",
"input": "<span id=' a'><span id='a '>",
"fail-if": "duplicate-id"},
{"description": "not duplicate ID values because case doesn't match",
"input": "<span id=a><span id=A>",
"fail-if": "duplicate-id"}
]}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,375 @@
{"tests": [
{"description": "unknown start tag <foo>",
"input": "<foo>",
"fail-unless": "unknown-start-tag"},
{"description": "allowed start tag <code>",
"input": "<code>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <kbd>",
"input": "<kbd>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <aside>",
"input": "<aside>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <datagrid>",
"input": "<datagrid>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <font>",
"input": "<font>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <noscript>",
"input": "<noscript>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <style>",
"input": "<style>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <img>",
"input": "<img>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <title>",
"input": "<title>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <menu>",
"input": "<menu>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <tr>",
"input": "<tr>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <param>",
"input": "<param>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <li>",
"input": "<li>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <source>",
"input": "<source>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <tfoot>",
"input": "<tfoot>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <th>",
"input": "<th>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <td>",
"input": "<td>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <dl>",
"input": "<dl>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <blockquote>",
"input": "<blockquote>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <dd>",
"input": "<dd>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <abbr>",
"input": "<abbr>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <dt>",
"input": "<dt>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <noembed>",
"input": "<noembed>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <p>",
"input": "<p>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <small>",
"input": "<small>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <meter>",
"input": "<meter>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <em>",
"input": "<em>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <meta>",
"input": "<meta>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <video>",
"input": "<video>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <div>",
"input": "<div>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <canvas>",
"input": "<canvas>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <sub>",
"input": "<sub>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <section>",
"input": "<section>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <sup>",
"input": "<sup>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <progress>",
"input": "<progress>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <body>",
"input": "<body>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <base>",
"input": "<base>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <br>",
"input": "<br>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <address>",
"input": "<address>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <article>",
"input": "<article>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <strong>",
"input": "<strong>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <legend>",
"input": "<legend>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <event-source>",
"input": "<event-source>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <ol>",
"input": "<ol>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <script>",
"input": "<script>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <caption>",
"input": "<caption>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <dialog>",
"input": "<dialog>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <col>",
"input": "<col>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <h2>",
"input": "<h2>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <h3>",
"input": "<h3>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <h1>",
"input": "<h1>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <h6>",
"input": "<h6>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <h4>",
"input": "<h4>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <h5>",
"input": "<h5>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <header>",
"input": "<header>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <table>",
"input": "<table>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <span>",
"input": "<span>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <area>",
"input": "<area>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <dfn>",
"input": "<dfn>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <var>",
"input": "<var>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <cite>",
"input": "<cite>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <thead>",
"input": "<thead>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <head>",
"input": "<head>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <hr>",
"input": "<hr>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <link>",
"input": "<link>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <datatemplate>",
"input": "<datatemplate>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <b>",
"input": "<b>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <colgroup>",
"input": "<colgroup>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <ul>",
"input": "<ul>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <del>",
"input": "<del>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <iframe>",
"input": "<iframe>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <pre>",
"input": "<pre>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <figure>",
"input": "<figure>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <ins>",
"input": "<ins>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <tbody>",
"input": "<tbody>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <html>",
"input": "<html>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <nav>",
"input": "<nav>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <details>",
"input": "<details>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <samp>",
"input": "<samp>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <map>",
"input": "<map>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <nest>",
"input": "<nest>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <object>",
"input": "<object>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <a>",
"input": "<a>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <footer>",
"input": "<footer>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <i>",
"input": "<i>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <m>",
"input": "<m>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <rule>",
"input": "<rule>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <q>",
"input": "<q>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <command>",
"input": "<command>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <time>",
"input": "<time>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <audio>",
"input": "<audio>",
"fail-if": "unknown-start-tag"},
{"description": "allowed start tag <bdo>",
"input": "<bdo>",
"fail-if": "unknown-start-tag"}
]}

View file

@ -1,81 +1,70 @@
require 'test/unit'
# Directory three levels above this file; a shared 'testdata' directory is
# looked for there first.
HTML5LIB_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))

# Directory holding the test data files. Uses HTML5LIB_BASE/testdata when it
# exists, otherwise falls back to a 'testdata' directory one level above the
# directory containing this file.
#
# File.exist? is used rather than File.exists?: the latter was deprecated and
# then removed in Ruby 3.2, while File.exist? is available on every version.
if File.exist?(File.join(HTML5LIB_BASE, 'testdata'))
  TESTDATA_DIR = File.join(HTML5LIB_BASE, 'testdata')
else
  TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata')
end
$:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')
$:.unshift File.dirname(__FILE__)
# Returns the paths of the test data files found directly inside the given
# subdirectory of TESTDATA_DIR. Only names matching '*.*' (i.e. containing a
# dot) are returned; a missing subdirectory yields an empty array.
def html5lib_test_files(subdirectory)
  pattern = File.join(TESTDATA_DIR, subdirectory, '*.*')
  Dir.glob(pattern)
end
begin
  require 'rubygems'
  require 'json'
rescue LoadError
  # Minimal stand-in for the json library, defined only when it cannot be
  # loaded. It rewrites the JSON text into a Ruby hash literal and evaluates it.
  class JSON
    # Parses +json+ and returns the resulting Ruby object.
    #
    # The argument is left untouched: the rewriting is done on copies, so the
    # caller's string is not modified and frozen strings are accepted.
    #
    # NOTE(review): this evaluates the text as Ruby code via eval. That is only
    # acceptable for trusted, checked-in test fixtures; never feed it external
    # data. Sequences such as '":' inside string values are also rewritten,
    # so this is not a general-purpose JSON parser.
    def self.parse(json)
      # '"key": value' becomes '"key"=> value' so objects read as Hash literals.
      ruby_source = json.gsub(/"\s*:/, '"=>')
      # Replace \uXXXX escapes with the UTF-8 encoding of that code point.
      ruby_source = ruby_source.gsub(/\\u[0-9a-fA-F]{4}/) { |x| [x[2..-1].to_i(16)].pack('U') }
      # Local visible to eval so that JSON's bare 'null' evaluates to nil.
      null = nil
      eval ruby_source
    end
  end
end
module HTML5lib
  # Helpers for reading test case text and normalising tree dumps before they
  # are compared in the test suites.
  module TestSupport
    # Reports whether string +b+ begins with +a+. Note the argument order:
    # the prefix comes first, the string being inspected second.
    def self.startswith?(a, b)
      b[0...a.length] == a
    end

    # Splits the text of a single test case into its sections.
    #
    # Lines are routed into the section opened by the most recent marker line
    # ("#errors", "#document" or "#document-fragment ..."); lines before any
    # marker belong to the input. Empty lines and any line that merely starts
    # with "#errors", "#document" or "#data" are dropped. Lines of the
    # document section that start with "|" lose their first two characters.
    #
    # Returns [innerHTML, input, output, errors] where innerHTML is the text
    # following "#document-fragment " (nil when the test uses "#document"),
    # input and output are newline-joined strings and errors is an array of
    # lines.
    #
    # Raises ArgumentError when a "#document-fragment" marker has nothing
    # after it.
    def self.parseTestcase(data)
      innerHTML = nil
      input = []
      output = []
      errors = []
      currentList = input
      data.split(/\n/).each do |line|
        # "#document" as a prefix also covers "#document-fragment".
        marker = %w(#errors #document #data).any? { |prefix| startswith?(prefix, line) }
        if !line.empty? && !marker
          # Identity check: comparing with == would treat the (still empty)
          # input list as the output list and strip "| " from input lines.
          if currentList.equal?(output) && startswith?("|", line)
            currentList.push(line[2..-1])
          else
            currentList.push(line)
          end
        elsif line == "#errors"
          currentList = errors
        elsif line == "#document" || startswith?("#document-fragment", line)
          if startswith?("#document-fragment", line)
            # Skip the 18-character marker plus the separator that follows it.
            innerHTML = line[19..-1]
            raise ArgumentError, "#document-fragment marker has no text after it" unless innerHTML
          end
          currentList = output
        end
      end
      [innerHTML, input.join("\n"), output.join("\n"), errors]
    end

    # convert the output of str(document) to the format used in the testcases
    #
    # The first line of the dump is discarded; every remaining line longer
    # than two characters that starts with "|" loses its first three
    # characters. An empty dump yields an empty string.
    def convertTreeDump(treedump)
      lines = treedump.split(/\n/)[1..-1] || []
      lines.map { |line| (line.length > 2 && line[0, 1] == '|') ? line[3..-1] : line }.join("\n")
    end

    # Sorts each run of consecutive, identically indented "name=..." lines
    # alphabetically so that attribute order does not affect comparisons.
    def sortattrs(output)
      output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
        match.split("\n").sort.join("\n")
      end
    end
  end
end
require 'test/unit'
# Directory three levels above this file; a shared 'testdata' directory is
# looked for there first.
HTML5_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))

# Directory holding the test data files. Uses HTML5_BASE/testdata when it
# exists, otherwise falls back to a 'testdata' directory one level above the
# directory containing this file.
#
# File.exist? is used rather than File.exists?: the latter was deprecated and
# then removed in Ruby 3.2, while File.exist? is available on every version.
if File.exist?(File.join(HTML5_BASE, 'testdata'))
  TESTDATA_DIR = File.join(HTML5_BASE, 'testdata')
else
  TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata')
end
$:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')
$:.unshift File.dirname(__FILE__)
# Returns the paths of the test data files found directly inside the given
# subdirectory of TESTDATA_DIR. Only names matching '*.*' (i.e. containing a
# dot) are returned; a missing subdirectory yields an empty array.
def html5_test_files(subdirectory)
  pattern = File.join(TESTDATA_DIR, subdirectory, '*.*')
  Dir.glob(pattern)
end
require 'rubygems'
require 'json'
module HTML5
  # Helpers for normalising tree dumps and reading sectioned test data files
  # in the test suites.
  module TestSupport
    # convert the output of str(document) to the format used in the testcases
    #
    # The first line of the dump is discarded; every remaining line longer
    # than two characters that starts with "|" loses its first three
    # characters. An empty dump yields an empty string.
    def convertTreeDump(treedump)
      lines = treedump.split(/\n/)[1..-1] || []
      lines.map { |line| (line.length > 2 && line[0, 1] == '|') ? line[3..-1] : line }.join("\n")
    end

    # Sorts each run of consecutive, identically indented "name=..." lines
    # alphabetically so that attribute order does not affect comparisons.
    def sortattrs(output)
      output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
        match.split("\n").sort.join("\n")
      end
    end

    # Enumerates the test cases stored in a sectioned data file.
    #
    # A section starts at a line consisting of "#" followed by one of the
    # configured section names; everything up to the next such line is that
    # section's text. A new test case begins whenever the first configured
    # section name is seen again.
    class TestData
      include Enumerable

      # filename - path of the data file; a missing file raises here.
      # sections - array of section names (without the leading "#"); the
      #            first one marks the start of each test case.
      def initialize(filename, sections)
        # Read eagerly with the block form so the handle is closed right away
        # and the test cases can be enumerated more than once.
        @lines = File.open(filename) { |f| f.readlines }
        @sections = sections
      end

      # Yields one array per test case holding the text of each configured
      # section, in the order given to the constructor (nil for a section the
      # test case does not contain). Nothing is yielded for a file without
      # any section heading. Returns an enumerator when called without a block.
      def each
        return enum_for(:each) unless block_given?

        data = {}
        key = nil
        @lines.each do |line|
          heading = heading_of(line)
          if heading
            if !data.empty? && heading == @sections[0]
              # Drop the blank separator line before the next test case;
              # normaliseOutput then removes the line's own newline.
              data[key].chomp!
              yield normaliseOutput(data)
              data = {}
            end
            key = heading
            data[key] = ''.dup
          elsif key
            data[key] += line
          end
        end
        # A Hash is always truthy, so test for content explicitly; otherwise
        # an empty file would produce a bogus all-nil test case.
        yield normaliseOutput(data) unless data.empty?
      end

      # Removes one trailing newline from every collected section (in place)
      # and returns the section texts ordered like the configured names.
      def normaliseOutput(data)
        data.keys.each { |key| data[key].chomp! }
        @sections.map { |heading| data[heading] }
      end

      private

      # Returns the section name when +line+ is a heading for one of the
      # configured sections, nil otherwise. The trailing newline is optional
      # so a heading on an unterminated last line is still recognised.
      def heading_of(line)
        return nil unless line[0, 1] == '#'
        name = line[-1, 1] == "\n" ? line[1..-2] : line[1..-1]
        @sections.include?(name) ? name : nil
      end
    end
  end
end

View file

@ -1,8 +1,10 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/inputstream'
require 'html5/inputstream'
class Html5EncodingTestCase < Test::Unit::TestCase
include HTML5
include TestSupport
begin
require 'rubygems'
@ -10,23 +12,21 @@ class Html5EncodingTestCase < Test::Unit::TestCase
def test_chardet
file = File.open(File.join(TESTDATA_DIR, 'encoding', 'chardet', 'test_big5.txt'), 'r')
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
stream = HTML5::HTMLInputStream.new(file, :chardet => true)
assert_equal 'big5', stream.char_encoding.downcase
rescue LoadError
puts "chardet not found, skipping chardet tests"
end
end
html5lib_test_files('encoding').each do |test_file|
html5_test_files('encoding').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
input, encoding = data.split(/\n#encoding\s+/, 2)
encoding = encoding.split[0]
TestData.new(test_file, %w(data encoding)).
each_with_index do |(input, encoding), index|
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
stream = HTML5::HTMLInputStream.new(input, :chardet => false)
assert_equal encoding.downcase, stream.char_encoding.downcase, input
end
end

Some files were not shown because too many files have changed in this diff Show more