Sync with latest HTML5lib and latest Maruku
This commit is contained in:
parent
8e92e4a3ab
commit
8ccaad85a5
71 changed files with 1974 additions and 1621 deletions
|
@ -25,14 +25,14 @@
|
||||||
|
|
||||||
module Sanitize
|
module Sanitize
|
||||||
|
|
||||||
require 'html5lib/html5parser'
|
require 'html5/html5parser'
|
||||||
require 'html5lib/liberalxmlparser'
|
require 'html5/liberalxmlparser'
|
||||||
require 'html5lib/treewalkers'
|
require 'html5/treewalkers'
|
||||||
require 'html5lib/treebuilders'
|
require 'html5/treebuilders'
|
||||||
require 'html5lib/serializer'
|
require 'html5/serializer'
|
||||||
require 'html5lib/sanitizer'
|
require 'html5/sanitizer'
|
||||||
|
|
||||||
include HTML5lib
|
include HTML5
|
||||||
|
|
||||||
# Sanitize a string, parsed using XHTML parsing rules.
|
# Sanitize a string, parsed using XHTML parsing rules.
|
||||||
#
|
#
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser'
|
require 'html5/html5parser'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
def self.parse(stream, options={})
|
def self.parse(stream, options={})
|
||||||
HTMLParser.parse(stream, options)
|
HTMLParser.parse(stream, options)
|
||||||
end
|
end
|
817
vendor/plugins/HTML5lib/lib/html5/constants.rb
vendored
Executable file
817
vendor/plugins/HTML5lib/lib/html5/constants.rb
vendored
Executable file
|
@ -0,0 +1,817 @@
|
||||||
|
module HTML5
|
||||||
|
|
||||||
|
class EOF < Exception; end
|
||||||
|
|
||||||
|
CONTENT_MODEL_FLAGS = [
|
||||||
|
:PCDATA,
|
||||||
|
:RCDATA,
|
||||||
|
:CDATA,
|
||||||
|
:PLAINTEXT
|
||||||
|
]
|
||||||
|
|
||||||
|
SCOPING_ELEMENTS = %w[
|
||||||
|
button
|
||||||
|
caption
|
||||||
|
html
|
||||||
|
marquee
|
||||||
|
object
|
||||||
|
table
|
||||||
|
td
|
||||||
|
th
|
||||||
|
]
|
||||||
|
|
||||||
|
FORMATTING_ELEMENTS = %w[
|
||||||
|
a
|
||||||
|
b
|
||||||
|
big
|
||||||
|
em
|
||||||
|
font
|
||||||
|
i
|
||||||
|
nobr
|
||||||
|
s
|
||||||
|
small
|
||||||
|
strike
|
||||||
|
strong
|
||||||
|
tt
|
||||||
|
u
|
||||||
|
]
|
||||||
|
|
||||||
|
SPECIAL_ELEMENTS = %w[
|
||||||
|
address
|
||||||
|
area
|
||||||
|
base
|
||||||
|
basefont
|
||||||
|
bgsound
|
||||||
|
blockquote
|
||||||
|
body
|
||||||
|
br
|
||||||
|
center
|
||||||
|
col
|
||||||
|
colgroup
|
||||||
|
dd
|
||||||
|
dir
|
||||||
|
div
|
||||||
|
dl
|
||||||
|
dt
|
||||||
|
embed
|
||||||
|
fieldset
|
||||||
|
form
|
||||||
|
frame
|
||||||
|
frameset
|
||||||
|
h1
|
||||||
|
h2
|
||||||
|
h3
|
||||||
|
h4
|
||||||
|
h5
|
||||||
|
h6
|
||||||
|
head
|
||||||
|
hr
|
||||||
|
iframe
|
||||||
|
image
|
||||||
|
img
|
||||||
|
input
|
||||||
|
isindex
|
||||||
|
li
|
||||||
|
link
|
||||||
|
listing
|
||||||
|
menu
|
||||||
|
meta
|
||||||
|
noembed
|
||||||
|
noframes
|
||||||
|
noscript
|
||||||
|
ol
|
||||||
|
optgroup
|
||||||
|
option
|
||||||
|
p
|
||||||
|
param
|
||||||
|
plaintext
|
||||||
|
pre
|
||||||
|
script
|
||||||
|
select
|
||||||
|
spacer
|
||||||
|
style
|
||||||
|
tbody
|
||||||
|
textarea
|
||||||
|
tfoot
|
||||||
|
thead
|
||||||
|
title
|
||||||
|
tr
|
||||||
|
ul
|
||||||
|
wbr
|
||||||
|
]
|
||||||
|
|
||||||
|
SPACE_CHARACTERS = %W[
|
||||||
|
\t
|
||||||
|
\n
|
||||||
|
\x0B
|
||||||
|
\x0C
|
||||||
|
\x20
|
||||||
|
\r
|
||||||
|
]
|
||||||
|
|
||||||
|
TABLE_INSERT_MODE_ELEMENTS = %w[
|
||||||
|
table
|
||||||
|
tbody
|
||||||
|
tfoot
|
||||||
|
thead
|
||||||
|
tr
|
||||||
|
]
|
||||||
|
|
||||||
|
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
|
||||||
|
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
|
||||||
|
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
|
||||||
|
DIGITS = '0'..'9'
|
||||||
|
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
|
||||||
|
|
||||||
|
# Heading elements need to be ordered
|
||||||
|
HEADING_ELEMENTS = %w[
|
||||||
|
h1
|
||||||
|
h2
|
||||||
|
h3
|
||||||
|
h4
|
||||||
|
h5
|
||||||
|
h6
|
||||||
|
]
|
||||||
|
|
||||||
|
# XXX What about event-source and command?
|
||||||
|
VOID_ELEMENTS = %w[
|
||||||
|
base
|
||||||
|
link
|
||||||
|
meta
|
||||||
|
hr
|
||||||
|
br
|
||||||
|
img
|
||||||
|
embed
|
||||||
|
param
|
||||||
|
area
|
||||||
|
col
|
||||||
|
input
|
||||||
|
]
|
||||||
|
|
||||||
|
CDATA_ELEMENTS = %w[title textarea]
|
||||||
|
|
||||||
|
RCDATA_ELEMENTS = %w[
|
||||||
|
style
|
||||||
|
script
|
||||||
|
xmp
|
||||||
|
iframe
|
||||||
|
noembed
|
||||||
|
noframes
|
||||||
|
noscript
|
||||||
|
]
|
||||||
|
|
||||||
|
BOOLEAN_ATTRIBUTES = {
|
||||||
|
:global => %w[irrelevant],
|
||||||
|
'style' => %w[scoped],
|
||||||
|
'img' => %w[ismap],
|
||||||
|
'audio' => %w[autoplay controls],
|
||||||
|
'video' => %w[autoplay controls],
|
||||||
|
'script' => %w[defer async],
|
||||||
|
'details' => %w[open],
|
||||||
|
'datagrid' => %w[multiple disabled],
|
||||||
|
'command' => %w[hidden disabled checked default],
|
||||||
|
'menu' => %w[autosubmit],
|
||||||
|
'fieldset' => %w[disabled readonly],
|
||||||
|
'option' => %w[disabled readonly selected],
|
||||||
|
'optgroup' => %w[disabled readonly],
|
||||||
|
'button' => %w[disabled autofocus],
|
||||||
|
'input' => %w[disabled readonly required autofocus checked ismap],
|
||||||
|
'select' => %w[disabled readonly autofocus multiple],
|
||||||
|
'output' => %w[disabled readonly]
|
||||||
|
}
|
||||||
|
|
||||||
|
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
|
||||||
|
ENTITIES_WINDOWS1252 = [
|
||||||
|
8364, # 0x80 0x20AC EURO SIGN
|
||||||
|
65533, # 0x81 UNDEFINED
|
||||||
|
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
|
||||||
|
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
|
||||||
|
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
|
||||||
|
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
|
||||||
|
8224, # 0x86 0x2020 DAGGER
|
||||||
|
8225, # 0x87 0x2021 DOUBLE DAGGER
|
||||||
|
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||||
|
8240, # 0x89 0x2030 PER MILLE SIGN
|
||||||
|
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
|
||||||
|
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||||
|
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
|
||||||
|
65533, # 0x8D UNDEFINED
|
||||||
|
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
|
||||||
|
65533, # 0x8F UNDEFINED
|
||||||
|
65533, # 0x90 UNDEFINED
|
||||||
|
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
|
||||||
|
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
|
||||||
|
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
|
||||||
|
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
|
||||||
|
8226, # 0x95 0x2022 BULLET
|
||||||
|
8211, # 0x96 0x2013 EN DASH
|
||||||
|
8212, # 0x97 0x2014 EM DASH
|
||||||
|
732, # 0x98 0x02DC SMALL TILDE
|
||||||
|
8482, # 0x99 0x2122 TRADE MARK SIGN
|
||||||
|
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
|
||||||
|
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||||
|
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
|
||||||
|
65533, # 0x9D UNDEFINED
|
||||||
|
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
|
||||||
|
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||||
|
]
|
||||||
|
|
||||||
|
# ENTITIES was generated from Python using the following code:
|
||||||
|
#
|
||||||
|
# import constants
|
||||||
|
# entities = constants.entities.items()
|
||||||
|
# entities.sort()
|
||||||
|
# list = [ ' '.join([repr(entity), '=>', ord(value)<128 and
|
||||||
|
# repr(str(value)) or repr(value.encode('utf-8')).replace("'",'"')])
|
||||||
|
# for entity, value in entities]
|
||||||
|
# print ' ENTITIES = {\n ' + ',\n '.join(list) + '\n }'
|
||||||
|
|
||||||
|
ENTITIES = {
|
||||||
|
'AElig' => "\xc3\x86",
|
||||||
|
'AElig;' => "\xc3\x86",
|
||||||
|
'AMP' => '&',
|
||||||
|
'AMP;' => '&',
|
||||||
|
'Aacute' => "\xc3\x81",
|
||||||
|
'Aacute;' => "\xc3\x81",
|
||||||
|
'Acirc' => "\xc3\x82",
|
||||||
|
'Acirc;' => "\xc3\x82",
|
||||||
|
'Agrave' => "\xc3\x80",
|
||||||
|
'Agrave;' => "\xc3\x80",
|
||||||
|
'Alpha;' => "\xce\x91",
|
||||||
|
'Aring' => "\xc3\x85",
|
||||||
|
'Aring;' => "\xc3\x85",
|
||||||
|
'Atilde' => "\xc3\x83",
|
||||||
|
'Atilde;' => "\xc3\x83",
|
||||||
|
'Auml' => "\xc3\x84",
|
||||||
|
'Auml;' => "\xc3\x84",
|
||||||
|
'Beta;' => "\xce\x92",
|
||||||
|
'COPY' => "\xc2\xa9",
|
||||||
|
'COPY;' => "\xc2\xa9",
|
||||||
|
'Ccedil' => "\xc3\x87",
|
||||||
|
'Ccedil;' => "\xc3\x87",
|
||||||
|
'Chi;' => "\xce\xa7",
|
||||||
|
'Dagger;' => "\xe2\x80\xa1",
|
||||||
|
'Delta;' => "\xce\x94",
|
||||||
|
'ETH' => "\xc3\x90",
|
||||||
|
'ETH;' => "\xc3\x90",
|
||||||
|
'Eacute' => "\xc3\x89",
|
||||||
|
'Eacute;' => "\xc3\x89",
|
||||||
|
'Ecirc' => "\xc3\x8a",
|
||||||
|
'Ecirc;' => "\xc3\x8a",
|
||||||
|
'Egrave' => "\xc3\x88",
|
||||||
|
'Egrave;' => "\xc3\x88",
|
||||||
|
'Epsilon;' => "\xce\x95",
|
||||||
|
'Eta;' => "\xce\x97",
|
||||||
|
'Euml' => "\xc3\x8b",
|
||||||
|
'Euml;' => "\xc3\x8b",
|
||||||
|
'GT' => '>',
|
||||||
|
'GT;' => '>',
|
||||||
|
'Gamma;' => "\xce\x93",
|
||||||
|
'Iacute' => "\xc3\x8d",
|
||||||
|
'Iacute;' => "\xc3\x8d",
|
||||||
|
'Icirc' => "\xc3\x8e",
|
||||||
|
'Icirc;' => "\xc3\x8e",
|
||||||
|
'Igrave' => "\xc3\x8c",
|
||||||
|
'Igrave;' => "\xc3\x8c",
|
||||||
|
'Iota;' => "\xce\x99",
|
||||||
|
'Iuml' => "\xc3\x8f",
|
||||||
|
'Iuml;' => "\xc3\x8f",
|
||||||
|
'Kappa;' => "\xce\x9a",
|
||||||
|
'LT' => '<',
|
||||||
|
'LT;' => '<',
|
||||||
|
'Lambda;' => "\xce\x9b",
|
||||||
|
'Mu;' => "\xce\x9c",
|
||||||
|
'Ntilde' => "\xc3\x91",
|
||||||
|
'Ntilde;' => "\xc3\x91",
|
||||||
|
'Nu;' => "\xce\x9d",
|
||||||
|
'OElig;' => "\xc5\x92",
|
||||||
|
'Oacute' => "\xc3\x93",
|
||||||
|
'Oacute;' => "\xc3\x93",
|
||||||
|
'Ocirc' => "\xc3\x94",
|
||||||
|
'Ocirc;' => "\xc3\x94",
|
||||||
|
'Ograve' => "\xc3\x92",
|
||||||
|
'Ograve;' => "\xc3\x92",
|
||||||
|
'Omega;' => "\xce\xa9",
|
||||||
|
'Omicron;' => "\xce\x9f",
|
||||||
|
'Oslash' => "\xc3\x98",
|
||||||
|
'Oslash;' => "\xc3\x98",
|
||||||
|
'Otilde' => "\xc3\x95",
|
||||||
|
'Otilde;' => "\xc3\x95",
|
||||||
|
'Ouml' => "\xc3\x96",
|
||||||
|
'Ouml;' => "\xc3\x96",
|
||||||
|
'Phi;' => "\xce\xa6",
|
||||||
|
'Pi;' => "\xce\xa0",
|
||||||
|
'Prime;' => "\xe2\x80\xb3",
|
||||||
|
'Psi;' => "\xce\xa8",
|
||||||
|
'QUOT' => '"',
|
||||||
|
'QUOT;' => '"',
|
||||||
|
'REG' => "\xc2\xae",
|
||||||
|
'REG;' => "\xc2\xae",
|
||||||
|
'Rho;' => "\xce\xa1",
|
||||||
|
'Scaron;' => "\xc5\xa0",
|
||||||
|
'Sigma;' => "\xce\xa3",
|
||||||
|
'THORN' => "\xc3\x9e",
|
||||||
|
'THORN;' => "\xc3\x9e",
|
||||||
|
'TRADE;' => "\xe2\x84\xa2",
|
||||||
|
'Tau;' => "\xce\xa4",
|
||||||
|
'Theta;' => "\xce\x98",
|
||||||
|
'Uacute' => "\xc3\x9a",
|
||||||
|
'Uacute;' => "\xc3\x9a",
|
||||||
|
'Ucirc' => "\xc3\x9b",
|
||||||
|
'Ucirc;' => "\xc3\x9b",
|
||||||
|
'Ugrave' => "\xc3\x99",
|
||||||
|
'Ugrave;' => "\xc3\x99",
|
||||||
|
'Upsilon;' => "\xce\xa5",
|
||||||
|
'Uuml' => "\xc3\x9c",
|
||||||
|
'Uuml;' => "\xc3\x9c",
|
||||||
|
'Xi;' => "\xce\x9e",
|
||||||
|
'Yacute' => "\xc3\x9d",
|
||||||
|
'Yacute;' => "\xc3\x9d",
|
||||||
|
'Yuml;' => "\xc5\xb8",
|
||||||
|
'Zeta;' => "\xce\x96",
|
||||||
|
'aacute' => "\xc3\xa1",
|
||||||
|
'aacute;' => "\xc3\xa1",
|
||||||
|
'acirc' => "\xc3\xa2",
|
||||||
|
'acirc;' => "\xc3\xa2",
|
||||||
|
'acute' => "\xc2\xb4",
|
||||||
|
'acute;' => "\xc2\xb4",
|
||||||
|
'aelig' => "\xc3\xa6",
|
||||||
|
'aelig;' => "\xc3\xa6",
|
||||||
|
'agrave' => "\xc3\xa0",
|
||||||
|
'agrave;' => "\xc3\xa0",
|
||||||
|
'alefsym;' => "\xe2\x84\xb5",
|
||||||
|
'alpha;' => "\xce\xb1",
|
||||||
|
'amp' => '&',
|
||||||
|
'amp;' => '&',
|
||||||
|
'and;' => "\xe2\x88\xa7",
|
||||||
|
'ang;' => "\xe2\x88\xa0",
|
||||||
|
'apos;' => "'",
|
||||||
|
'aring' => "\xc3\xa5",
|
||||||
|
'aring;' => "\xc3\xa5",
|
||||||
|
'asymp;' => "\xe2\x89\x88",
|
||||||
|
'atilde' => "\xc3\xa3",
|
||||||
|
'atilde;' => "\xc3\xa3",
|
||||||
|
'auml' => "\xc3\xa4",
|
||||||
|
'auml;' => "\xc3\xa4",
|
||||||
|
'bdquo;' => "\xe2\x80\x9e",
|
||||||
|
'beta;' => "\xce\xb2",
|
||||||
|
'brvbar' => "\xc2\xa6",
|
||||||
|
'brvbar;' => "\xc2\xa6",
|
||||||
|
'bull;' => "\xe2\x80\xa2",
|
||||||
|
'cap;' => "\xe2\x88\xa9",
|
||||||
|
'ccedil' => "\xc3\xa7",
|
||||||
|
'ccedil;' => "\xc3\xa7",
|
||||||
|
'cedil' => "\xc2\xb8",
|
||||||
|
'cedil;' => "\xc2\xb8",
|
||||||
|
'cent' => "\xc2\xa2",
|
||||||
|
'cent;' => "\xc2\xa2",
|
||||||
|
'chi;' => "\xcf\x87",
|
||||||
|
'circ;' => "\xcb\x86",
|
||||||
|
'clubs;' => "\xe2\x99\xa3",
|
||||||
|
'cong;' => "\xe2\x89\x85",
|
||||||
|
'copy' => "\xc2\xa9",
|
||||||
|
'copy;' => "\xc2\xa9",
|
||||||
|
'crarr;' => "\xe2\x86\xb5",
|
||||||
|
'cup;' => "\xe2\x88\xaa",
|
||||||
|
'curren' => "\xc2\xa4",
|
||||||
|
'curren;' => "\xc2\xa4",
|
||||||
|
'dArr;' => "\xe2\x87\x93",
|
||||||
|
'dagger;' => "\xe2\x80\xa0",
|
||||||
|
'darr;' => "\xe2\x86\x93",
|
||||||
|
'deg' => "\xc2\xb0",
|
||||||
|
'deg;' => "\xc2\xb0",
|
||||||
|
'delta;' => "\xce\xb4",
|
||||||
|
'diams;' => "\xe2\x99\xa6",
|
||||||
|
'divide' => "\xc3\xb7",
|
||||||
|
'divide;' => "\xc3\xb7",
|
||||||
|
'eacute' => "\xc3\xa9",
|
||||||
|
'eacute;' => "\xc3\xa9",
|
||||||
|
'ecirc' => "\xc3\xaa",
|
||||||
|
'ecirc;' => "\xc3\xaa",
|
||||||
|
'egrave' => "\xc3\xa8",
|
||||||
|
'egrave;' => "\xc3\xa8",
|
||||||
|
'empty;' => "\xe2\x88\x85",
|
||||||
|
'emsp;' => "\xe2\x80\x83",
|
||||||
|
'ensp;' => "\xe2\x80\x82",
|
||||||
|
'epsilon;' => "\xce\xb5",
|
||||||
|
'equiv;' => "\xe2\x89\xa1",
|
||||||
|
'eta;' => "\xce\xb7",
|
||||||
|
'eth' => "\xc3\xb0",
|
||||||
|
'eth;' => "\xc3\xb0",
|
||||||
|
'euml' => "\xc3\xab",
|
||||||
|
'euml;' => "\xc3\xab",
|
||||||
|
'euro;' => "\xe2\x82\xac",
|
||||||
|
'exist;' => "\xe2\x88\x83",
|
||||||
|
'fnof;' => "\xc6\x92",
|
||||||
|
'forall;' => "\xe2\x88\x80",
|
||||||
|
'frac12' => "\xc2\xbd",
|
||||||
|
'frac12;' => "\xc2\xbd",
|
||||||
|
'frac14' => "\xc2\xbc",
|
||||||
|
'frac14;' => "\xc2\xbc",
|
||||||
|
'frac34' => "\xc2\xbe",
|
||||||
|
'frac34;' => "\xc2\xbe",
|
||||||
|
'frasl;' => "\xe2\x81\x84",
|
||||||
|
'gamma;' => "\xce\xb3",
|
||||||
|
'ge;' => "\xe2\x89\xa5",
|
||||||
|
'gt' => '>',
|
||||||
|
'gt;' => '>',
|
||||||
|
'hArr;' => "\xe2\x87\x94",
|
||||||
|
'harr;' => "\xe2\x86\x94",
|
||||||
|
'hearts;' => "\xe2\x99\xa5",
|
||||||
|
'hellip;' => "\xe2\x80\xa6",
|
||||||
|
'iacute' => "\xc3\xad",
|
||||||
|
'iacute;' => "\xc3\xad",
|
||||||
|
'icirc' => "\xc3\xae",
|
||||||
|
'icirc;' => "\xc3\xae",
|
||||||
|
'iexcl' => "\xc2\xa1",
|
||||||
|
'iexcl;' => "\xc2\xa1",
|
||||||
|
'igrave' => "\xc3\xac",
|
||||||
|
'igrave;' => "\xc3\xac",
|
||||||
|
'image;' => "\xe2\x84\x91",
|
||||||
|
'infin;' => "\xe2\x88\x9e",
|
||||||
|
'int;' => "\xe2\x88\xab",
|
||||||
|
'iota;' => "\xce\xb9",
|
||||||
|
'iquest' => "\xc2\xbf",
|
||||||
|
'iquest;' => "\xc2\xbf",
|
||||||
|
'isin;' => "\xe2\x88\x88",
|
||||||
|
'iuml' => "\xc3\xaf",
|
||||||
|
'iuml;' => "\xc3\xaf",
|
||||||
|
'kappa;' => "\xce\xba",
|
||||||
|
'lArr;' => "\xe2\x87\x90",
|
||||||
|
'lambda;' => "\xce\xbb",
|
||||||
|
'lang;' => "\xe3\x80\x88",
|
||||||
|
'laquo' => "\xc2\xab",
|
||||||
|
'laquo;' => "\xc2\xab",
|
||||||
|
'larr;' => "\xe2\x86\x90",
|
||||||
|
'lceil;' => "\xe2\x8c\x88",
|
||||||
|
'ldquo;' => "\xe2\x80\x9c",
|
||||||
|
'le;' => "\xe2\x89\xa4",
|
||||||
|
'lfloor;' => "\xe2\x8c\x8a",
|
||||||
|
'lowast;' => "\xe2\x88\x97",
|
||||||
|
'loz;' => "\xe2\x97\x8a",
|
||||||
|
'lrm;' => "\xe2\x80\x8e",
|
||||||
|
'lsaquo;' => "\xe2\x80\xb9",
|
||||||
|
'lsquo;' => "\xe2\x80\x98",
|
||||||
|
'lt' => '<',
|
||||||
|
'lt;' => '<',
|
||||||
|
'macr' => "\xc2\xaf",
|
||||||
|
'macr;' => "\xc2\xaf",
|
||||||
|
'mdash;' => "\xe2\x80\x94",
|
||||||
|
'micro' => "\xc2\xb5",
|
||||||
|
'micro;' => "\xc2\xb5",
|
||||||
|
'middot' => "\xc2\xb7",
|
||||||
|
'middot;' => "\xc2\xb7",
|
||||||
|
'minus;' => "\xe2\x88\x92",
|
||||||
|
'mu;' => "\xce\xbc",
|
||||||
|
'nabla;' => "\xe2\x88\x87",
|
||||||
|
'nbsp' => "\xc2\xa0",
|
||||||
|
'nbsp;' => "\xc2\xa0",
|
||||||
|
'ndash;' => "\xe2\x80\x93",
|
||||||
|
'ne;' => "\xe2\x89\xa0",
|
||||||
|
'ni;' => "\xe2\x88\x8b",
|
||||||
|
'not' => "\xc2\xac",
|
||||||
|
'not;' => "\xc2\xac",
|
||||||
|
'notin;' => "\xe2\x88\x89",
|
||||||
|
'nsub;' => "\xe2\x8a\x84",
|
||||||
|
'ntilde' => "\xc3\xb1",
|
||||||
|
'ntilde;' => "\xc3\xb1",
|
||||||
|
'nu;' => "\xce\xbd",
|
||||||
|
'oacute' => "\xc3\xb3",
|
||||||
|
'oacute;' => "\xc3\xb3",
|
||||||
|
'ocirc' => "\xc3\xb4",
|
||||||
|
'ocirc;' => "\xc3\xb4",
|
||||||
|
'oelig;' => "\xc5\x93",
|
||||||
|
'ograve' => "\xc3\xb2",
|
||||||
|
'ograve;' => "\xc3\xb2",
|
||||||
|
'oline;' => "\xe2\x80\xbe",
|
||||||
|
'omega;' => "\xcf\x89",
|
||||||
|
'omicron;' => "\xce\xbf",
|
||||||
|
'oplus;' => "\xe2\x8a\x95",
|
||||||
|
'or;' => "\xe2\x88\xa8",
|
||||||
|
'ordf' => "\xc2\xaa",
|
||||||
|
'ordf;' => "\xc2\xaa",
|
||||||
|
'ordm' => "\xc2\xba",
|
||||||
|
'ordm;' => "\xc2\xba",
|
||||||
|
'oslash' => "\xc3\xb8",
|
||||||
|
'oslash;' => "\xc3\xb8",
|
||||||
|
'otilde' => "\xc3\xb5",
|
||||||
|
'otilde;' => "\xc3\xb5",
|
||||||
|
'otimes;' => "\xe2\x8a\x97",
|
||||||
|
'ouml' => "\xc3\xb6",
|
||||||
|
'ouml;' => "\xc3\xb6",
|
||||||
|
'para' => "\xc2\xb6",
|
||||||
|
'para;' => "\xc2\xb6",
|
||||||
|
'part;' => "\xe2\x88\x82",
|
||||||
|
'permil;' => "\xe2\x80\xb0",
|
||||||
|
'perp;' => "\xe2\x8a\xa5",
|
||||||
|
'phi;' => "\xcf\x86",
|
||||||
|
'pi;' => "\xcf\x80",
|
||||||
|
'piv;' => "\xcf\x96",
|
||||||
|
'plusmn' => "\xc2\xb1",
|
||||||
|
'plusmn;' => "\xc2\xb1",
|
||||||
|
'pound' => "\xc2\xa3",
|
||||||
|
'pound;' => "\xc2\xa3",
|
||||||
|
'prime;' => "\xe2\x80\xb2",
|
||||||
|
'prod;' => "\xe2\x88\x8f",
|
||||||
|
'prop;' => "\xe2\x88\x9d",
|
||||||
|
'psi;' => "\xcf\x88",
|
||||||
|
'quot' => '"',
|
||||||
|
'quot;' => '"',
|
||||||
|
'rArr;' => "\xe2\x87\x92",
|
||||||
|
'radic;' => "\xe2\x88\x9a",
|
||||||
|
'rang;' => "\xe3\x80\x89",
|
||||||
|
'raquo' => "\xc2\xbb",
|
||||||
|
'raquo;' => "\xc2\xbb",
|
||||||
|
'rarr;' => "\xe2\x86\x92",
|
||||||
|
'rceil;' => "\xe2\x8c\x89",
|
||||||
|
'rdquo;' => "\xe2\x80\x9d",
|
||||||
|
'real;' => "\xe2\x84\x9c",
|
||||||
|
'reg' => "\xc2\xae",
|
||||||
|
'reg;' => "\xc2\xae",
|
||||||
|
'rfloor;' => "\xe2\x8c\x8b",
|
||||||
|
'rho;' => "\xcf\x81",
|
||||||
|
'rlm;' => "\xe2\x80\x8f",
|
||||||
|
'rsaquo;' => "\xe2\x80\xba",
|
||||||
|
'rsquo;' => "\xe2\x80\x99",
|
||||||
|
'sbquo;' => "\xe2\x80\x9a",
|
||||||
|
'scaron;' => "\xc5\xa1",
|
||||||
|
'sdot;' => "\xe2\x8b\x85",
|
||||||
|
'sect' => "\xc2\xa7",
|
||||||
|
'sect;' => "\xc2\xa7",
|
||||||
|
'shy' => "\xc2\xad",
|
||||||
|
'shy;' => "\xc2\xad",
|
||||||
|
'sigma;' => "\xcf\x83",
|
||||||
|
'sigmaf;' => "\xcf\x82",
|
||||||
|
'sim;' => "\xe2\x88\xbc",
|
||||||
|
'spades;' => "\xe2\x99\xa0",
|
||||||
|
'sub;' => "\xe2\x8a\x82",
|
||||||
|
'sube;' => "\xe2\x8a\x86",
|
||||||
|
'sum;' => "\xe2\x88\x91",
|
||||||
|
'sup1' => "\xc2\xb9",
|
||||||
|
'sup1;' => "\xc2\xb9",
|
||||||
|
'sup2' => "\xc2\xb2",
|
||||||
|
'sup2;' => "\xc2\xb2",
|
||||||
|
'sup3' => "\xc2\xb3",
|
||||||
|
'sup3;' => "\xc2\xb3",
|
||||||
|
'sup;' => "\xe2\x8a\x83",
|
||||||
|
'supe;' => "\xe2\x8a\x87",
|
||||||
|
'szlig' => "\xc3\x9f",
|
||||||
|
'szlig;' => "\xc3\x9f",
|
||||||
|
'tau;' => "\xcf\x84",
|
||||||
|
'there4;' => "\xe2\x88\xb4",
|
||||||
|
'theta;' => "\xce\xb8",
|
||||||
|
'thetasym;' => "\xcf\x91",
|
||||||
|
'thinsp;' => "\xe2\x80\x89",
|
||||||
|
'thorn' => "\xc3\xbe",
|
||||||
|
'thorn;' => "\xc3\xbe",
|
||||||
|
'tilde;' => "\xcb\x9c",
|
||||||
|
'times' => "\xc3\x97",
|
||||||
|
'times;' => "\xc3\x97",
|
||||||
|
'trade;' => "\xe2\x84\xa2",
|
||||||
|
'uArr;' => "\xe2\x87\x91",
|
||||||
|
'uacute' => "\xc3\xba",
|
||||||
|
'uacute;' => "\xc3\xba",
|
||||||
|
'uarr;' => "\xe2\x86\x91",
|
||||||
|
'ucirc' => "\xc3\xbb",
|
||||||
|
'ucirc;' => "\xc3\xbb",
|
||||||
|
'ugrave' => "\xc3\xb9",
|
||||||
|
'ugrave;' => "\xc3\xb9",
|
||||||
|
'uml' => "\xc2\xa8",
|
||||||
|
'uml;' => "\xc2\xa8",
|
||||||
|
'upsih;' => "\xcf\x92",
|
||||||
|
'upsilon;' => "\xcf\x85",
|
||||||
|
'uuml' => "\xc3\xbc",
|
||||||
|
'uuml;' => "\xc3\xbc",
|
||||||
|
'weierp;' => "\xe2\x84\x98",
|
||||||
|
'xi;' => "\xce\xbe",
|
||||||
|
'yacute' => "\xc3\xbd",
|
||||||
|
'yacute;' => "\xc3\xbd",
|
||||||
|
'yen' => "\xc2\xa5",
|
||||||
|
'yen;' => "\xc2\xa5",
|
||||||
|
'yuml' => "\xc3\xbf",
|
||||||
|
'yuml;' => "\xc3\xbf",
|
||||||
|
'zeta;' => "\xce\xb6",
|
||||||
|
'zwj;' => "\xe2\x80\x8d",
|
||||||
|
'zwnj;' => "\xe2\x80\x8c"
|
||||||
|
}
|
||||||
|
|
||||||
|
ENCODINGS = %w[
|
||||||
|
ansi_x3.4-1968
|
||||||
|
iso-ir-6
|
||||||
|
ansi_x3.4-1986
|
||||||
|
iso_646.irv:1991
|
||||||
|
ascii
|
||||||
|
iso646-us
|
||||||
|
us-ascii
|
||||||
|
us
|
||||||
|
ibm367
|
||||||
|
cp367
|
||||||
|
csascii
|
||||||
|
ks_c_5601-1987
|
||||||
|
korean
|
||||||
|
iso-2022-kr
|
||||||
|
csiso2022kr
|
||||||
|
euc-kr
|
||||||
|
iso-2022-jp
|
||||||
|
csiso2022jp
|
||||||
|
iso-2022-jp-2
|
||||||
|
iso-ir-58
|
||||||
|
chinese
|
||||||
|
csiso58gb231280
|
||||||
|
iso_8859-1:1987
|
||||||
|
iso-ir-100
|
||||||
|
iso_8859-1
|
||||||
|
iso-8859-1
|
||||||
|
latin1
|
||||||
|
l1
|
||||||
|
ibm819
|
||||||
|
cp819
|
||||||
|
csisolatin1
|
||||||
|
iso_8859-2:1987
|
||||||
|
iso-ir-101
|
||||||
|
iso_8859-2
|
||||||
|
iso-8859-2
|
||||||
|
latin2
|
||||||
|
l2
|
||||||
|
csisolatin2
|
||||||
|
iso_8859-3:1988
|
||||||
|
iso-ir-109
|
||||||
|
iso_8859-3
|
||||||
|
iso-8859-3
|
||||||
|
latin3
|
||||||
|
l3
|
||||||
|
csisolatin3
|
||||||
|
iso_8859-4:1988
|
||||||
|
iso-ir-110
|
||||||
|
iso_8859-4
|
||||||
|
iso-8859-4
|
||||||
|
latin4
|
||||||
|
l4
|
||||||
|
csisolatin4
|
||||||
|
iso_8859-6:1987
|
||||||
|
iso-ir-127
|
||||||
|
iso_8859-6
|
||||||
|
iso-8859-6
|
||||||
|
ecma-114
|
||||||
|
asmo-708
|
||||||
|
arabic
|
||||||
|
csisolatinarabic
|
||||||
|
iso_8859-7:1987
|
||||||
|
iso-ir-126
|
||||||
|
iso_8859-7
|
||||||
|
iso-8859-7
|
||||||
|
elot_928
|
||||||
|
ecma-118
|
||||||
|
greek
|
||||||
|
greek8
|
||||||
|
csisolatingreek
|
||||||
|
iso_8859-8:1988
|
||||||
|
iso-ir-138
|
||||||
|
iso_8859-8
|
||||||
|
iso-8859-8
|
||||||
|
hebrew
|
||||||
|
csisolatinhebrew
|
||||||
|
iso_8859-5:1988
|
||||||
|
iso-ir-144
|
||||||
|
iso_8859-5
|
||||||
|
iso-8859-5
|
||||||
|
cyrillic
|
||||||
|
csisolatincyrillic
|
||||||
|
iso_8859-9:1989
|
||||||
|
iso-ir-148
|
||||||
|
iso_8859-9
|
||||||
|
iso-8859-9
|
||||||
|
latin5
|
||||||
|
l5
|
||||||
|
csisolatin5
|
||||||
|
iso-8859-10
|
||||||
|
iso-ir-157
|
||||||
|
l6
|
||||||
|
iso_8859-10:1992
|
||||||
|
csisolatin6
|
||||||
|
latin6
|
||||||
|
hp-roman8
|
||||||
|
roman8
|
||||||
|
r8
|
||||||
|
ibm037
|
||||||
|
cp037
|
||||||
|
csibm037
|
||||||
|
ibm424
|
||||||
|
cp424
|
||||||
|
csibm424
|
||||||
|
ibm437
|
||||||
|
cp437
|
||||||
|
437
|
||||||
|
cspc8codepage437
|
||||||
|
ibm500
|
||||||
|
cp500
|
||||||
|
csibm500
|
||||||
|
ibm775
|
||||||
|
cp775
|
||||||
|
cspc775baltic
|
||||||
|
ibm850
|
||||||
|
cp850
|
||||||
|
850
|
||||||
|
cspc850multilingual
|
||||||
|
ibm852
|
||||||
|
cp852
|
||||||
|
852
|
||||||
|
cspcp852
|
||||||
|
ibm855
|
||||||
|
cp855
|
||||||
|
855
|
||||||
|
csibm855
|
||||||
|
ibm857
|
||||||
|
cp857
|
||||||
|
857
|
||||||
|
csibm857
|
||||||
|
ibm860
|
||||||
|
cp860
|
||||||
|
860
|
||||||
|
csibm860
|
||||||
|
ibm861
|
||||||
|
cp861
|
||||||
|
861
|
||||||
|
cp-is
|
||||||
|
csibm861
|
||||||
|
ibm862
|
||||||
|
cp862
|
||||||
|
862
|
||||||
|
cspc862latinhebrew
|
||||||
|
ibm863
|
||||||
|
cp863
|
||||||
|
863
|
||||||
|
csibm863
|
||||||
|
ibm864
|
||||||
|
cp864
|
||||||
|
csibm864
|
||||||
|
ibm865
|
||||||
|
cp865
|
||||||
|
865
|
||||||
|
csibm865
|
||||||
|
ibm866
|
||||||
|
cp866
|
||||||
|
866
|
||||||
|
csibm866
|
||||||
|
ibm869
|
||||||
|
cp869
|
||||||
|
869
|
||||||
|
cp-gr
|
||||||
|
csibm869
|
||||||
|
ibm1026
|
||||||
|
cp1026
|
||||||
|
csibm1026
|
||||||
|
koi8-r
|
||||||
|
cskoi8r
|
||||||
|
koi8-u
|
||||||
|
big5-hkscs
|
||||||
|
ptcp154
|
||||||
|
csptcp154
|
||||||
|
pt154
|
||||||
|
cp154
|
||||||
|
utf-7
|
||||||
|
utf-16be
|
||||||
|
utf-16le
|
||||||
|
utf-16
|
||||||
|
utf-8
|
||||||
|
iso-8859-13
|
||||||
|
iso-8859-14
|
||||||
|
iso-ir-199
|
||||||
|
iso_8859-14:1998
|
||||||
|
iso_8859-14
|
||||||
|
latin8
|
||||||
|
iso-celtic
|
||||||
|
l8
|
||||||
|
iso-8859-15
|
||||||
|
iso_8859-15
|
||||||
|
iso-8859-16
|
||||||
|
iso-ir-226
|
||||||
|
iso_8859-16:2001
|
||||||
|
iso_8859-16
|
||||||
|
latin10
|
||||||
|
l10
|
||||||
|
gbk
|
||||||
|
cp936
|
||||||
|
ms936
|
||||||
|
gb18030
|
||||||
|
shift_jis
|
||||||
|
ms_kanji
|
||||||
|
csshiftjis
|
||||||
|
euc-jp
|
||||||
|
gb2312
|
||||||
|
big5
|
||||||
|
csbig5
|
||||||
|
windows-1250
|
||||||
|
windows-1251
|
||||||
|
windows-1252
|
||||||
|
windows-1253
|
||||||
|
windows-1254
|
||||||
|
windows-1255
|
||||||
|
windows-1256
|
||||||
|
windows-1257
|
||||||
|
windows-1258
|
||||||
|
tis-620
|
||||||
|
hz-gb-2312
|
||||||
|
]
|
||||||
|
|
||||||
|
end
|
1
vendor/plugins/HTML5lib/lib/html5/filters.rb
vendored
Normal file
1
vendor/plugins/HTML5lib/lib/html5/filters.rb
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
require 'html5/filters/optionaltags'
|
|
@ -1,7 +1,7 @@
|
||||||
require 'delegate'
|
require 'delegate'
|
||||||
require 'enumerator'
|
require 'enumerator'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
module Filters
|
module Filters
|
||||||
class Base < SimpleDelegator
|
class Base < SimpleDelegator
|
||||||
include Enumerable
|
include Enumerable
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/filters/base'
|
require 'html5/filters/base'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
module Filters
|
module Filters
|
||||||
class InjectMetaCharset < Base
|
class InjectMetaCharset < Base
|
||||||
def initialize(source, encoding)
|
def initialize(source, encoding)
|
|
@ -1,7 +1,7 @@
|
||||||
require 'html5lib/constants'
|
require 'html5/constants'
|
||||||
require 'html5lib/filters/base'
|
require 'html5/filters/base'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
module Filters
|
module Filters
|
||||||
|
|
||||||
class OptionalTagFilter < Base
|
class OptionalTagFilter < Base
|
|
@ -1,7 +1,7 @@
|
||||||
require 'html5lib/filters/base'
|
require 'html5/filters/base'
|
||||||
require 'html5lib/sanitizer'
|
require 'html5/sanitizer'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
module Filters
|
module Filters
|
||||||
class HTMLSanitizeFilter < Base
|
class HTMLSanitizeFilter < Base
|
||||||
include HTMLSanitizeModule
|
include HTMLSanitizeModule
|
|
@ -1,7 +1,7 @@
|
||||||
require 'html5lib/constants'
|
require 'html5/constants'
|
||||||
require 'html5lib/filters/base'
|
require 'html5/filters/base'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
module Filters
|
module Filters
|
||||||
class WhitespaceFilter < Base
|
class WhitespaceFilter < Base
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
require 'html5lib/constants'
|
require 'html5/constants'
|
||||||
require 'html5lib/tokenizer'
|
require 'html5/tokenizer'
|
||||||
require 'html5lib/treebuilders/rexml'
|
require 'html5/treebuilders/rexml'
|
||||||
|
|
||||||
Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
|
Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
|
||||||
require 'html5lib/html5parser/' + File.basename(path)
|
require 'html5/html5parser/' + File.basename(path)
|
||||||
end
|
end
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
|
|
||||||
# Error in parsed document
|
# Error in parsed document
|
||||||
class ParseError < Exception; end
|
class ParseError < Exception; end
|
||||||
|
@ -37,7 +37,7 @@ module HTML5lib
|
||||||
# :strict - raise an exception when a parse error is encountered
|
# :strict - raise an exception when a parse error is encountered
|
||||||
# :tree - a treebuilder class controlling the type of tree that will be
|
# :tree - a treebuilder class controlling the type of tree that will be
|
||||||
# returned. Built in treebuilders can be accessed through
|
# returned. Built in treebuilders can be accessed through
|
||||||
# HTML5lib::TreeBuilders[treeType]
|
# HTML5::TreeBuilders[treeType]
|
||||||
def initialize(options = {})
|
def initialize(options = {})
|
||||||
@strict = false
|
@strict = false
|
||||||
@errors = []
|
@errors = []
|
||||||
|
@ -51,7 +51,7 @@ module HTML5lib
|
||||||
|
|
||||||
@phases = @@phases.inject({}) do |phases, phase_name|
|
@phases = @@phases.inject({}) do |phases, phase_name|
|
||||||
phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
|
phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
|
||||||
phases[phase_name.to_sym] = HTML5lib.const_get(phase_class_name).new(self, @tree)
|
phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
|
||||||
phases
|
phases
|
||||||
end
|
end
|
||||||
end
|
end
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class AfterBodyPhase < Phase
|
class AfterBodyPhase < Phase
|
||||||
|
|
||||||
handle_end 'html'
|
handle_end 'html'
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class AfterFramesetPhase < Phase
|
class AfterFramesetPhase < Phase
|
||||||
|
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#after3
|
# http://www.whatwg.org/specs/web-apps/current-work/#after3
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class AfterHeadPhase < Phase
|
class AfterHeadPhase < Phase
|
||||||
|
|
||||||
handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'
|
handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'
|
|
@ -1,11 +1,11 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class BeforeHeadPhase < Phase
|
class BeforeHeadPhase < Phase
|
||||||
|
|
||||||
handle_start 'html', 'head'
|
handle_start 'html', 'head'
|
||||||
|
|
||||||
handle_end %w( html head body br ) => 'ImplyHead'
|
handle_end %w( html head body br p ) => 'ImplyHead'
|
||||||
|
|
||||||
def processEOF
|
def processEOF
|
||||||
startTagHead('head', {})
|
startTagHead('head', {})
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class InBodyPhase < Phase
|
class InBodyPhase < Phase
|
||||||
|
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
|
||||||
|
@ -112,7 +112,7 @@ module HTML5lib
|
||||||
|
|
||||||
def startTagForm(name, attributes)
|
def startTagForm(name, attributes)
|
||||||
if @tree.formPointer
|
if @tree.formPointer
|
||||||
@parser.parseError('Unexpected start tag (form). Ignored.')
|
@parser.parseError(_('Unexpected start tag (form). Ignored.'))
|
||||||
else
|
else
|
||||||
endTagP('p') if in_scope?('p')
|
endTagP('p') if in_scope?('p')
|
||||||
@tree.insertElement(name, attributes)
|
@tree.insertElement(name, attributes)
|
||||||
|
@ -129,9 +129,9 @@ module HTML5lib
|
||||||
if stopName.include?(node.name)
|
if stopName.include?(node.name)
|
||||||
poppedNodes = (0..i).collect { @tree.openElements.pop }
|
poppedNodes = (0..i).collect { @tree.openElements.pop }
|
||||||
if i >= 1
|
if i >= 1
|
||||||
@parser.parseError("Missing end tag%s (%s)" % [
|
@parser.parseError(_("Missing end tag%s (%s)" % [
|
||||||
(i>1 ? 's' : ''),
|
(i>1 ? 's' : ''),
|
||||||
poppedNodes.reverse.map {|item| item.name}.join(', ')])
|
poppedNodes.reverse.map {|item| item.name}.join(', ')]))
|
||||||
end
|
end
|
||||||
break
|
break
|
||||||
end
|
end
|
||||||
|
@ -251,7 +251,7 @@ module HTML5lib
|
||||||
end
|
end
|
||||||
|
|
||||||
def startTagIsindex(name, attributes)
|
def startTagIsindex(name, attributes)
|
||||||
@parser.parseError("Unexpected start tag isindex. Don't use it!")
|
@parser.parseError(_("Unexpected start tag isindex. Don't use it!"))
|
||||||
return if @tree.formPointer
|
return if @tree.formPointer
|
||||||
processStartTag('form', {})
|
processStartTag('form', {})
|
||||||
processStartTag('hr', {})
|
processStartTag('hr', {})
|
||||||
|
@ -311,8 +311,13 @@ module HTML5lib
|
||||||
|
|
||||||
def endTagP(name)
|
def endTagP(name)
|
||||||
@tree.generateImpliedEndTags('p') if in_scope?('p')
|
@tree.generateImpliedEndTags('p') if in_scope?('p')
|
||||||
@parser.parseError('Unexpected end tag (p).') unless @tree.openElements[-1].name == 'p'
|
@parser.parseError(_('Unexpected end tag (p).')) unless @tree.openElements[-1].name == 'p'
|
||||||
|
if in_scope?('p')
|
||||||
@tree.openElements.pop while in_scope?('p')
|
@tree.openElements.pop while in_scope?('p')
|
||||||
|
else
|
||||||
|
startTagCloseP('p', {})
|
||||||
|
endTagP('p')
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def endTagBody(name)
|
def endTagBody(name)
|
||||||
|
@ -342,7 +347,7 @@ module HTML5lib
|
||||||
@tree.generateImpliedEndTags if in_scope?(name)
|
@tree.generateImpliedEndTags if in_scope?(name)
|
||||||
|
|
||||||
unless @tree.openElements[-1].name == name
|
unless @tree.openElements[-1].name == name
|
||||||
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
|
@parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
|
||||||
end
|
end
|
||||||
|
|
||||||
if in_scope?(name)
|
if in_scope?(name)
|
||||||
|
@ -351,7 +356,14 @@ module HTML5lib
|
||||||
end
|
end
|
||||||
|
|
||||||
def endTagForm(name)
|
def endTagForm(name)
|
||||||
endTagBlock(name)
|
if in_scope?(name)
|
||||||
|
@tree.generateImpliedEndTags
|
||||||
|
end
|
||||||
|
if @tree.openElements[-1].name != name
|
||||||
|
@parser.parseError(_("End tag (form) seen too early. Ignored."))
|
||||||
|
else
|
||||||
|
@tree.openElements.pop
|
||||||
|
end
|
||||||
@tree.formPointer = nil
|
@tree.formPointer = nil
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -361,7 +373,7 @@ module HTML5lib
|
||||||
@tree.generateImpliedEndTags(name)
|
@tree.generateImpliedEndTags(name)
|
||||||
|
|
||||||
unless @tree.openElements[-1].name == name
|
unless @tree.openElements[-1].name == name
|
||||||
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
|
@parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -377,7 +389,7 @@ module HTML5lib
|
||||||
end
|
end
|
||||||
|
|
||||||
unless @tree.openElements[-1].name == name
|
unless @tree.openElements[-1].name == name
|
||||||
@parser.parseError(("Unexpected end tag (#{name}). Expected other end tag."))
|
@parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag."))
|
||||||
end
|
end
|
||||||
|
|
||||||
HEADING_ELEMENTS.each do |element|
|
HEADING_ELEMENTS.each do |element|
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class InCaptionPhase < Phase
|
class InCaptionPhase < Phase
|
||||||
|
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class InCellPhase < Phase
|
class InCellPhase < Phase
|
||||||
|
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class InColumnGroupPhase < Phase
|
class InColumnGroupPhase < Phase
|
||||||
|
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-column
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-column
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class InFramesetPhase < Phase
|
class InFramesetPhase < Phase
|
||||||
|
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
|
|
@ -1,12 +1,12 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class InHeadPhase < Phase
|
class InHeadPhase < Phase
|
||||||
|
|
||||||
handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta )
|
handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta )
|
||||||
|
|
||||||
handle_end 'head'
|
handle_end 'head'
|
||||||
handle_end %w( html body br ) => 'ImplyAfterHead'
|
handle_end %w( html body br p ) => 'ImplyAfterHead'
|
||||||
handle_end %w( title style script )
|
handle_end %w( title style script )
|
||||||
|
|
||||||
def processEOF
|
def processEOF
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class InRowPhase < Phase
|
class InRowPhase < Phase
|
||||||
|
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class InSelectPhase < Phase
|
class InSelectPhase < Phase
|
||||||
|
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-select
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-select
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class InTableBodyPhase < Phase
|
class InTableBodyPhase < Phase
|
||||||
|
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class InTablePhase < Phase
|
class InTablePhase < Phase
|
||||||
|
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class InitialPhase < Phase
|
class InitialPhase < Phase
|
||||||
|
|
||||||
# This phase deals with error handling as well which is currently not
|
# This phase deals with error handling as well which is currently not
|
|
@ -1,4 +1,4 @@
|
||||||
module HTML5lib
|
module HTML5
|
||||||
# Base class for helper objects that implement each phase of processing.
|
# Base class for helper objects that implement each phase of processing.
|
||||||
#
|
#
|
||||||
# Handler methods should be in the following order (they can be omitted):
|
# Handler methods should be in the following order (they can be omitted):
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class RootElementPhase < Phase
|
class RootElementPhase < Phase
|
||||||
|
|
||||||
def processEOF
|
def processEOF
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/html5parser/phase'
|
require 'html5/html5parser/phase'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
class TrailingEndPhase < Phase
|
class TrailingEndPhase < Phase
|
||||||
|
|
||||||
def processEOF
|
def processEOF
|
|
@ -1,7 +1,7 @@
|
||||||
require 'stringio'
|
require 'stringio'
|
||||||
require 'html5lib/constants'
|
require 'html5/constants'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
|
|
||||||
# Provides a unicode stream of characters to the HTMLTokenizer.
|
# Provides a unicode stream of characters to the HTMLTokenizer.
|
||||||
|
|
||||||
|
@ -10,7 +10,7 @@ module HTML5lib
|
||||||
|
|
||||||
class HTMLInputStream
|
class HTMLInputStream
|
||||||
|
|
||||||
attr_accessor :queue, :char_encoding
|
attr_accessor :queue, :char_encoding, :errors
|
||||||
|
|
||||||
# Initialises the HTMLInputStream.
|
# Initialises the HTMLInputStream.
|
||||||
#
|
#
|
||||||
|
@ -40,25 +40,31 @@ module HTML5lib
|
||||||
#Number of bytes to use when looking for a meta element with
|
#Number of bytes to use when looking for a meta element with
|
||||||
#encoding information
|
#encoding information
|
||||||
@NUM_BYTES_META = 512
|
@NUM_BYTES_META = 512
|
||||||
|
#Number of bytes to use when using detecting encoding using chardet
|
||||||
|
@NUM_BYTES_CHARDET = 256
|
||||||
|
#Number of bytes to use when reading content
|
||||||
|
@NUM_BYTES_BUFFER = 1024
|
||||||
|
|
||||||
#Encoding to use if no other information can be found
|
#Encoding to use if no other information can be found
|
||||||
@DEFAULT_ENCODING = 'windows-1252'
|
@DEFAULT_ENCODING = 'windows-1252'
|
||||||
|
|
||||||
#Detect encoding iff no explicit "transport level" encoding is supplied
|
#Detect encoding iff no explicit "transport level" encoding is supplied
|
||||||
if @encoding.nil? or not HTML5lib.is_valid_encoding(@encoding)
|
if @encoding.nil? or not HTML5.is_valid_encoding(@encoding)
|
||||||
@char_encoding = detect_encoding
|
@char_encoding = detect_encoding
|
||||||
else
|
else
|
||||||
@char_encoding = @encoding
|
@char_encoding = @encoding
|
||||||
end
|
end
|
||||||
|
|
||||||
# Read bytes from stream decoding them into Unicode
|
# Read bytes from stream decoding them into Unicode
|
||||||
uString = @raw_stream.read
|
@buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
|
||||||
if @char_encoding == 'windows-1252'
|
if @char_encoding == 'windows-1252'
|
||||||
@win1252 = true
|
@win1252 = true
|
||||||
elsif @char_encoding != 'utf-8'
|
elsif @char_encoding != 'utf-8'
|
||||||
begin
|
begin
|
||||||
require 'iconv'
|
require 'iconv'
|
||||||
begin
|
begin
|
||||||
uString = Iconv.iconv('utf-8', @char_encoding, uString).first
|
@buffer << @raw_stream.read unless @raw_stream.eof?
|
||||||
|
@buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
|
||||||
rescue
|
rescue
|
||||||
@win1252 = true
|
@win1252 = true
|
||||||
end
|
end
|
||||||
|
@ -67,10 +73,8 @@ module HTML5lib
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# Convert the unicode string into a list to be used as the data stream
|
|
||||||
@data_stream = uString
|
|
||||||
|
|
||||||
@queue = []
|
@queue = []
|
||||||
|
@errors = []
|
||||||
|
|
||||||
# Reset position in the list to read from
|
# Reset position in the list to read from
|
||||||
@tell = 0
|
@tell = 0
|
||||||
|
@ -109,9 +113,22 @@ module HTML5lib
|
||||||
begin
|
begin
|
||||||
require 'rubygems'
|
require 'rubygems'
|
||||||
require 'UniversalDetector' # gem install chardet
|
require 'UniversalDetector' # gem install chardet
|
||||||
buffer = @raw_stream.read
|
buffers = []
|
||||||
encoding = UniversalDetector::chardet(buffer)['encoding']
|
detector = UniversalDetector::Detector.instance
|
||||||
seek(buffer, 0)
|
detector.reset
|
||||||
|
until @raw_stream.eof?
|
||||||
|
buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
|
||||||
|
break if !buffer or buffer.empty?
|
||||||
|
buffers << buffer
|
||||||
|
detector.feed(buffer)
|
||||||
|
break if detector.instance_eval {@done}
|
||||||
|
detector.instance_eval {
|
||||||
|
@_mLastChar = @_mLastChar.chr if Fixnum === @_mLastChar
|
||||||
|
}
|
||||||
|
end
|
||||||
|
detector.close
|
||||||
|
encoding = detector.result['encoding']
|
||||||
|
seek(buffers*'', 0)
|
||||||
rescue LoadError
|
rescue LoadError
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -242,14 +259,20 @@ module HTML5lib
|
||||||
unless @queue.empty?
|
unless @queue.empty?
|
||||||
return @queue.shift
|
return @queue.shift
|
||||||
else
|
else
|
||||||
c = @data_stream[@tell]
|
if @tell + 3 > @buffer.length and !@raw_stream.eof?
|
||||||
|
# read next block
|
||||||
|
@buffer = @buffer[@tell .. -1] + @raw_stream.read(@NUM_BYTES_BUFFER)
|
||||||
|
@tell = 0
|
||||||
|
end
|
||||||
|
|
||||||
|
c = @buffer[@tell]
|
||||||
@tell += 1
|
@tell += 1
|
||||||
|
|
||||||
case c
|
case c
|
||||||
when 0x01 .. 0x7F
|
when 0x01 .. 0x7F
|
||||||
if c == 0x0D
|
if c == 0x0D
|
||||||
# normalize newlines
|
# normalize newlines
|
||||||
@tell += 1 if @data_stream[@tell] == 0x0A
|
@tell += 1 if @buffer[@tell] == 0x0A
|
||||||
c = 0x0A
|
c = 0x0A
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -276,7 +299,7 @@ module HTML5lib
|
||||||
when 0xC0 .. 0xFF
|
when 0xC0 .. 0xFF
|
||||||
if @win1252
|
if @win1252
|
||||||
"\xC3" + (c-64).chr # convert to utf-8
|
"\xC3" + (c-64).chr # convert to utf-8
|
||||||
elsif @data_stream[@tell-1 .. -1] =~ /^
|
elsif @buffer[@tell-1 .. @tell+3] =~ /^
|
||||||
( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
|
( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
|
||||||
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
|
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
|
||||||
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
|
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
|
||||||
|
@ -292,6 +315,8 @@ module HTML5lib
|
||||||
end
|
end
|
||||||
|
|
||||||
when 0x00
|
when 0x00
|
||||||
|
@errors.push('null character found in input stream, ' +
|
||||||
|
'replaced with U+FFFD')
|
||||||
[0xFFFD].pack('U') # null characters are invalid
|
[0xFFFD].pack('U') # null characters are invalid
|
||||||
|
|
||||||
else
|
else
|
||||||
|
@ -317,6 +342,10 @@ module HTML5lib
|
||||||
@queue.insert(0, c) unless c == :EOF
|
@queue.insert(0, c) unless c == :EOF
|
||||||
return char_stack.join('')
|
return char_stack.join('')
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def unget(characters)
|
||||||
|
@queue.unshift(*characters.to_a) unless characters == :EOF
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# String-like object with an assosiated position and various extra methods
|
# String-like object with an assosiated position and various extra methods
|
||||||
|
@ -433,14 +462,14 @@ module HTML5lib
|
||||||
|
|
||||||
if attr[0] == 'charset'
|
if attr[0] == 'charset'
|
||||||
tentative_encoding = attr[1]
|
tentative_encoding = attr[1]
|
||||||
if HTML5lib.is_valid_encoding(tentative_encoding)
|
if HTML5.is_valid_encoding(tentative_encoding)
|
||||||
@encoding = tentative_encoding
|
@encoding = tentative_encoding
|
||||||
return false
|
return false
|
||||||
end
|
end
|
||||||
elsif attr[0] == 'content'
|
elsif attr[0] == 'content'
|
||||||
content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
|
content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
|
||||||
tentative_encoding = content_parser.parse
|
tentative_encoding = content_parser.parse
|
||||||
if HTML5lib.is_valid_encoding(tentative_encoding)
|
if HTML5.is_valid_encoding(tentative_encoding)
|
||||||
@encoding = tentative_encoding
|
@encoding = tentative_encoding
|
||||||
return false
|
return false
|
||||||
end
|
end
|
|
@ -11,10 +11,10 @@
|
||||||
#
|
#
|
||||||
# @@TODO:
|
# @@TODO:
|
||||||
# * Selectively lowercase only XHTML, but not foreign markup
|
# * Selectively lowercase only XHTML, but not foreign markup
|
||||||
require 'html5lib/html5parser'
|
require 'html5/html5parser'
|
||||||
require 'html5lib/constants'
|
require 'html5/constants'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
|
|
||||||
# liberal XML parser
|
# liberal XML parser
|
||||||
class XMLParser < HTMLParser
|
class XMLParser < HTMLParser
|
||||||
|
@ -25,25 +25,35 @@ module HTML5lib
|
||||||
end
|
end
|
||||||
|
|
||||||
def normalizeToken(token)
|
def normalizeToken(token)
|
||||||
if token[:type] == :StartTag or token[:type] == :EmptyTag
|
case token[:type]
|
||||||
|
when :StartTag, :EmptyTag
|
||||||
# We need to remove the duplicate attributes and convert attributes
|
# We need to remove the duplicate attributes and convert attributes
|
||||||
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
# to a Hash so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
||||||
|
|
||||||
token[:data] = Hash[*token[:data].reverse.flatten]
|
token[:data] = Hash[*token[:data].reverse.flatten]
|
||||||
|
|
||||||
# For EmptyTags, process both a Start and an End tag
|
# For EmptyTags, process both a Start and an End tag
|
||||||
if token[:type] == :EmptyTag
|
if token[:type] == :EmptyTag
|
||||||
|
save = @tokenizer.contentModelFlag
|
||||||
@phase.processStartTag(token[:name], token[:data])
|
@phase.processStartTag(token[:name], token[:data])
|
||||||
|
@tokenizer.contentModelFlag = save
|
||||||
token[:data] = {}
|
token[:data] = {}
|
||||||
token[:type] = :EndTag
|
token[:type] = :EndTag
|
||||||
end
|
end
|
||||||
|
|
||||||
elsif token[:type] == :EndTag
|
when :Characters
|
||||||
|
# un-escape RCDATA_ELEMENTS (e.g. style, script)
|
||||||
|
if @tokenizer.contentModelFlag == :CDATA
|
||||||
|
token[:data] = token[:data].
|
||||||
|
gsub('&lt;','<').gsub('&gt;','>').gsub('&amp;','&')
|
||||||
|
end
|
||||||
|
|
||||||
|
when :EndTag
|
||||||
if token[:data]
|
if token[:data]
|
||||||
parseError(_("End tag contains unexpected attributes."))
|
parseError(_("End tag contains unexpected attributes."))
|
||||||
end
|
end
|
||||||
|
|
||||||
elsif token[:type] == :Comment
|
when :Comment
|
||||||
# Rescue CDATA from the comments
|
# Rescue CDATA from the comments
|
||||||
if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
|
if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
|
||||||
token[:type] = :Characters
|
token[:type] = :Characters
|
|
@ -1,6 +1,7 @@
|
||||||
require 'cgi'
|
require 'cgi'
|
||||||
|
require 'html5/tokenizer'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
|
|
||||||
# This module provides sanitization of XHTML+MathML+SVG
|
# This module provides sanitization of XHTML+MathML+SVG
|
||||||
# and of inline style attributes.
|
# and of inline style attributes.
|
2
vendor/plugins/HTML5lib/lib/html5/serializer.rb
vendored
Normal file
2
vendor/plugins/HTML5lib/lib/html5/serializer.rb
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
require 'html5/serializer/htmlserializer'
|
||||||
|
require 'html5/serializer/xhtmlserializer'
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/constants'
|
require 'html5/constants'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
|
|
||||||
class HTMLSerializer
|
class HTMLSerializer
|
||||||
|
|
||||||
|
@ -21,6 +21,7 @@ module HTML5lib
|
||||||
@use_trailing_solidus = false
|
@use_trailing_solidus = false
|
||||||
@space_before_trailing_solidus = true
|
@space_before_trailing_solidus = true
|
||||||
@escape_lt_in_attrs = false
|
@escape_lt_in_attrs = false
|
||||||
|
@escape_rcdata = false
|
||||||
|
|
||||||
@omit_optional_tags = true
|
@omit_optional_tags = true
|
||||||
@sanitize = false
|
@sanitize = false
|
||||||
|
@ -43,22 +44,22 @@ module HTML5lib
|
||||||
@errors = []
|
@errors = []
|
||||||
|
|
||||||
if encoding and @inject_meta_charset
|
if encoding and @inject_meta_charset
|
||||||
require 'html5lib/filters/inject_meta_charset'
|
require 'html5/filters/inject_meta_charset'
|
||||||
treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
|
treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
|
||||||
end
|
end
|
||||||
|
|
||||||
if @strip_whitespace
|
if @strip_whitespace
|
||||||
require 'html5lib/filters/whitespace'
|
require 'html5/filters/whitespace'
|
||||||
treewalker = Filters::WhitespaceFilter.new(treewalker)
|
treewalker = Filters::WhitespaceFilter.new(treewalker)
|
||||||
end
|
end
|
||||||
|
|
||||||
if @sanitize
|
if @sanitize
|
||||||
require 'html5lib/filters/sanitizer'
|
require 'html5/filters/sanitizer'
|
||||||
treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
|
treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
|
||||||
end
|
end
|
||||||
|
|
||||||
if @omit_optional_tags
|
if @omit_optional_tags
|
||||||
require 'html5lib/filters/optionaltags'
|
require 'html5/filters/optionaltags'
|
||||||
treewalker = Filters::OptionalTagFilter.new(treewalker)
|
treewalker = Filters::OptionalTagFilter.new(treewalker)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -81,7 +82,7 @@ module HTML5lib
|
||||||
|
|
||||||
elsif [:StartTag, :EmptyTag].include? type
|
elsif [:StartTag, :EmptyTag].include? type
|
||||||
name = token[:name]
|
name = token[:name]
|
||||||
if RCDATA_ELEMENTS.include?(name)
|
if RCDATA_ELEMENTS.include?(name) and not @escape_rcdata
|
||||||
in_cdata = true
|
in_cdata = true
|
||||||
elsif in_cdata
|
elsif in_cdata
|
||||||
serializeError(_("Unexpected child element of a CDATA element"))
|
serializeError(_("Unexpected child element of a CDATA element"))
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/serializer/htmlserializer'
|
require 'html5/serializer/htmlserializer'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
|
|
||||||
class XHTMLSerializer < HTMLSerializer
|
class XHTMLSerializer < HTMLSerializer
|
||||||
DEFAULTS = {
|
DEFAULTS = {
|
||||||
|
@ -8,7 +8,8 @@ module HTML5lib
|
||||||
:minimize_boolean_attributes => false,
|
:minimize_boolean_attributes => false,
|
||||||
:use_trailing_solidus => true,
|
:use_trailing_solidus => true,
|
||||||
:escape_lt_in_attrs => true,
|
:escape_lt_in_attrs => true,
|
||||||
:omit_optional_tags => false
|
:omit_optional_tags => false,
|
||||||
|
:escape_rcdata => true
|
||||||
}
|
}
|
||||||
|
|
||||||
def initialize(options={})
|
def initialize(options={})
|
|
@ -1,7 +1,7 @@
|
||||||
require 'html5lib/constants'
|
require 'html5/constants'
|
||||||
require 'html5lib/inputstream'
|
require 'html5/inputstream'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
|
|
||||||
# This class takes care of tokenizing HTML.
|
# This class takes care of tokenizing HTML.
|
||||||
#
|
#
|
||||||
|
@ -84,9 +84,9 @@ module HTML5lib
|
||||||
# Start processing. When EOF is reached @state will return false
|
# Start processing. When EOF is reached @state will return false
|
||||||
# instead of true and the loop will terminate.
|
# instead of true and the loop will terminate.
|
||||||
while send @state
|
while send @state
|
||||||
while not @tokenQueue.empty?
|
yield :type => :ParseError, :data => @stream.errors.shift until
|
||||||
yield @tokenQueue.shift
|
@stream.errors.empty?
|
||||||
end
|
yield @tokenQueue.shift until @tokenQueue.empty?
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -109,7 +109,7 @@ module HTML5lib
|
||||||
|
|
||||||
# The character we just consumed need to be put back on the stack so it
|
# The character we just consumed need to be put back on the stack so it
|
||||||
# doesn't get lost...
|
# doesn't get lost...
|
||||||
@stream.queue.push(data)
|
@stream.unget(data)
|
||||||
end
|
end
|
||||||
|
|
||||||
# This function returns either U+FFFD or the character based on the
|
# This function returns either U+FFFD or the character based on the
|
||||||
|
@ -128,7 +128,6 @@ module HTML5lib
|
||||||
radix = 16
|
radix = 16
|
||||||
end
|
end
|
||||||
|
|
||||||
char = [0xFFFD].pack('U')
|
|
||||||
charStack = []
|
charStack = []
|
||||||
|
|
||||||
# Consume all the characters that are in range while making sure we
|
# Consume all the characters that are in range while making sure we
|
||||||
|
@ -142,17 +141,25 @@ module HTML5lib
|
||||||
# Convert the set of characters consumed to an int.
|
# Convert the set of characters consumed to an int.
|
||||||
charAsInt = charStack.join('').to_i(radix)
|
charAsInt = charStack.join('').to_i(radix)
|
||||||
|
|
||||||
# If the integer is between 127 and 160 (so 128 and bigger and 159 and
|
if charAsInt == 13
|
||||||
# smaller) we need to do the "windows trick".
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
if (127...160).include? charAsInt
|
_("Incorrect CR newline entity. Replaced with LF.")})
|
||||||
|
charAsInt = 10
|
||||||
|
elsif (128..159).include? charAsInt
|
||||||
|
# If the integer is between 127 and 160 (so 128 and bigger and 159
|
||||||
|
# and smaller) we need to do the "windows trick".
|
||||||
@tokenQueue.push({:type => :ParseError, :data =>
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
_("Entity used with illegal number (windows-1252 reference).")})
|
_("Entity used with illegal number (windows-1252 reference).")})
|
||||||
|
|
||||||
charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
|
charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
|
||||||
end
|
end
|
||||||
|
|
||||||
if charAsInt > 0 and charAsInt <= 1114111
|
if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343)
|
||||||
char = [charAsInt].pack('U')
|
char = [charAsInt].pack('U')
|
||||||
|
else
|
||||||
|
char = [0xFFFD].pack('U')
|
||||||
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
|
_("Numeric entity represents an illegal codepoint.")})
|
||||||
end
|
end
|
||||||
|
|
||||||
# Discard the ; if present. Otherwise, put it back on the queue and
|
# Discard the ; if present. Otherwise, put it back on the queue and
|
||||||
|
@ -160,18 +167,18 @@ module HTML5lib
|
||||||
if c != ";"
|
if c != ";"
|
||||||
@tokenQueue.push({:type => :ParseError, :data =>
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
_("Numeric entity didn't end with ';'.")})
|
_("Numeric entity didn't end with ';'.")})
|
||||||
@stream.queue.push(c)
|
@stream.unget(c)
|
||||||
end
|
end
|
||||||
|
|
||||||
return char
|
return char
|
||||||
end
|
end
|
||||||
|
|
||||||
def consumeEntity
|
def consumeEntity(from_attribute=false)
|
||||||
char = nil
|
char = nil
|
||||||
charStack = [@stream.char]
|
charStack = [@stream.char]
|
||||||
if SPACE_CHARACTERS.include?(charStack[0]) or
|
if SPACE_CHARACTERS.include?(charStack[0]) or
|
||||||
[:EOF, '<', '&'].include?(charStack[0])
|
[:EOF, '<', '&'].include?(charStack[0])
|
||||||
@stream.queue+= charStack
|
@stream.unget(charStack)
|
||||||
elsif charStack[0] == "#"
|
elsif charStack[0] == "#"
|
||||||
# We might have a number entity here.
|
# We might have a number entity here.
|
||||||
charStack += [@stream.char, @stream.char]
|
charStack += [@stream.char, @stream.char]
|
||||||
|
@ -179,22 +186,22 @@ module HTML5lib
|
||||||
# If we reach the end of the file put everything up to :EOF
|
# If we reach the end of the file put everything up to :EOF
|
||||||
# back in the queue
|
# back in the queue
|
||||||
charStack = charStack[0...charStack.index(:EOF)]
|
charStack = charStack[0...charStack.index(:EOF)]
|
||||||
@stream.queue+= charStack
|
@stream.unget(charStack)
|
||||||
@tokenQueue.push({:type => :ParseError, :data =>
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
_("Numeric entity expected. Got end of file instead.")})
|
_("Numeric entity expected. Got end of file instead.")})
|
||||||
else
|
else
|
||||||
if charStack[1].downcase == "x" \
|
if charStack[1].downcase == "x" \
|
||||||
and HEX_DIGITS.include? charStack[2]
|
and HEX_DIGITS.include? charStack[2]
|
||||||
# Hexadecimal entity detected.
|
# Hexadecimal entity detected.
|
||||||
@stream.queue.push(charStack[2])
|
@stream.unget(charStack[2])
|
||||||
char = consumeNumberEntity(true)
|
char = consumeNumberEntity(true)
|
||||||
elsif DIGITS.include? charStack[1]
|
elsif DIGITS.include? charStack[1]
|
||||||
# Decimal entity detected.
|
# Decimal entity detected.
|
||||||
@stream.queue += charStack[1..-1]
|
@stream.unget(charStack[1..-1])
|
||||||
char = consumeNumberEntity(false)
|
char = consumeNumberEntity(false)
|
||||||
else
|
else
|
||||||
# No number entity detected.
|
# No number entity detected.
|
||||||
@stream.queue += charStack
|
@stream.unget(charStack)
|
||||||
@tokenQueue.push({:type => :ParseError, :data =>
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
_("Numeric entity expected but none found.")})
|
_("Numeric entity expected but none found.")})
|
||||||
end
|
end
|
||||||
|
@ -209,6 +216,8 @@ module HTML5lib
|
||||||
filteredEntityList.reject! {|e| e[0].chr != charStack[0]}
|
filteredEntityList.reject! {|e| e[0].chr != charStack[0]}
|
||||||
entityName = nil
|
entityName = nil
|
||||||
|
|
||||||
|
# Try to find the longest entity the string will match to take care
|
||||||
|
# of &noti for instance.
|
||||||
while charStack[-1] != :EOF
|
while charStack[-1] != :EOF
|
||||||
name = charStack.join('')
|
name = charStack.join('')
|
||||||
if filteredEntityList.any? {|e| e[0...name.length] == name}
|
if filteredEntityList.any? {|e| e[0...name.length] == name}
|
||||||
|
@ -220,6 +229,7 @@ module HTML5lib
|
||||||
|
|
||||||
if ENTITIES.include? name
|
if ENTITIES.include? name
|
||||||
entityName = name
|
entityName = name
|
||||||
|
break if entityName[-1] == ';'
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -228,15 +238,23 @@ module HTML5lib
|
||||||
|
|
||||||
# Check whether or not the last character returned can be
|
# Check whether or not the last character returned can be
|
||||||
# discarded or needs to be put back.
|
# discarded or needs to be put back.
|
||||||
if not charStack[-1] == ";"
|
if entityName[-1] != ?;
|
||||||
@tokenQueue.push({:type => :ParseError, :data =>
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
_("Named entity didn't end with ';'.")})
|
_("Named entity didn't end with ';'.")})
|
||||||
@stream.queue += charStack[entityName.length..-1]
|
end
|
||||||
|
|
||||||
|
if charStack[-1] != ";" and from_attribute and
|
||||||
|
(ASCII_LETTERS.include?(charStack[entityName.length]) or
|
||||||
|
DIGITS.include?(charStack[entityName.length]))
|
||||||
|
@stream.unget(charStack)
|
||||||
|
char = '&'
|
||||||
|
else
|
||||||
|
@stream.unget(charStack[entityName.length..-1])
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
@tokenQueue.push({:type => :ParseError, :data =>
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
_("Named entity expected. Got none.")})
|
_("Named entity expected. Got none.")})
|
||||||
@stream.queue += charStack
|
@stream.unget(charStack)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
return char
|
return char
|
||||||
|
@ -244,7 +262,7 @@ module HTML5lib
|
||||||
|
|
||||||
# This method replaces the need for "entityInAttributeValueState".
|
# This method replaces the need for "entityInAttributeValueState".
|
||||||
def processEntityInAttribute
|
def processEntityInAttribute
|
||||||
entity = consumeEntity
|
entity = consumeEntity(true)
|
||||||
if entity
|
if entity
|
||||||
@currentToken[:data][-1][1] += entity
|
@currentToken[:data][-1][1] += entity
|
||||||
else
|
else
|
||||||
|
@ -274,20 +292,23 @@ module HTML5lib
|
||||||
@lastFourChars.shift if @lastFourChars.length > 4
|
@lastFourChars.shift if @lastFourChars.length > 4
|
||||||
end
|
end
|
||||||
|
|
||||||
if data == "&" and [:PCDATA,:RCDATA].include?(@contentModelFlag)
|
if data == "&" and !@escapeFlag and
|
||||||
|
[:PCDATA,:RCDATA].include?(@contentModelFlag)
|
||||||
@state = @states[:entityData]
|
@state = @states[:entityData]
|
||||||
|
|
||||||
elsif data == "-" and [:CDATA,:RCDATA].include?(@contentModelFlag) and
|
elsif data == "-" and !@escapeFlag and
|
||||||
@escapeFlag == false and @lastFourChars.join('') == "<!--"
|
[:CDATA,:RCDATA].include?(@contentModelFlag) and
|
||||||
|
@lastFourChars.join('') == "<!--"
|
||||||
@escapeFlag = true
|
@escapeFlag = true
|
||||||
@tokenQueue.push({:type => :Characters, :data => data})
|
@tokenQueue.push({:type => :Characters, :data => data})
|
||||||
|
|
||||||
elsif data == "<" and @escapeFlag == false and
|
elsif data == "<" and !@escapeFlag and
|
||||||
[:PCDATA,:CDATA,:RCDATA].include?(@contentModelFlag)
|
[:PCDATA,:CDATA,:RCDATA].include?(@contentModelFlag)
|
||||||
@state = @states[:tagOpen]
|
@state = @states[:tagOpen]
|
||||||
|
|
||||||
elsif data == ">" and [:CDATA,:RCDATA].include?(@contentModelFlag) and
|
elsif data == ">" and @escapeFlag and
|
||||||
@escapeFlag == true and @lastFourChars[1..-1].join('') == "-->"
|
[:CDATA,:RCDATA].include?(@contentModelFlag) and
|
||||||
|
@lastFourChars[1..-1].join('') == "-->"
|
||||||
@escapeFlag = false
|
@escapeFlag = false
|
||||||
@tokenQueue.push({:type => :Characters, :data => data})
|
@tokenQueue.push({:type => :Characters, :data => data})
|
||||||
|
|
||||||
|
@ -345,14 +366,14 @@ module HTML5lib
|
||||||
@tokenQueue.push({:type => :ParseError, :data =>
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
_("Expected tag name. Got '?' instead (HTML doesn't " +
|
_("Expected tag name. Got '?' instead (HTML doesn't " +
|
||||||
"support processing instructions).")})
|
"support processing instructions).")})
|
||||||
@stream.queue.push(data)
|
@stream.unget(data)
|
||||||
@state = @states[:bogusComment]
|
@state = @states[:bogusComment]
|
||||||
else
|
else
|
||||||
# XXX
|
# XXX
|
||||||
@tokenQueue.push({:type => :ParseError, :data =>
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
_("Expected tag name. Got something else instead")})
|
_("Expected tag name. Got something else instead")})
|
||||||
@tokenQueue.push({:type => :Characters, :data => "<"})
|
@tokenQueue.push({:type => :Characters, :data => "<"})
|
||||||
@stream.queue.push(data)
|
@stream.unget(data)
|
||||||
@state = @states[:data]
|
@state = @states[:data]
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
|
@ -363,7 +384,7 @@ module HTML5lib
|
||||||
@state = @states[:closeTagOpen]
|
@state = @states[:closeTagOpen]
|
||||||
else
|
else
|
||||||
@tokenQueue.push({:type => :Characters, :data => "<"})
|
@tokenQueue.push({:type => :Characters, :data => "<"})
|
||||||
@stream.queue.insert(0, data)
|
@stream.unget(data)
|
||||||
@state = @states[:data]
|
@state = @states[:data]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -388,7 +409,7 @@ module HTML5lib
|
||||||
|
|
||||||
# Since this is just for checking. We put the characters back on
|
# Since this is just for checking. We put the characters back on
|
||||||
# the stack.
|
# the stack.
|
||||||
@stream.queue += charStack
|
@stream.unget(charStack)
|
||||||
end
|
end
|
||||||
|
|
||||||
if @currentToken and
|
if @currentToken and
|
||||||
|
@ -426,7 +447,7 @@ module HTML5lib
|
||||||
# XXX data can be _'_...
|
# XXX data can be _'_...
|
||||||
@tokenQueue.push({:type => :ParseError, :data =>
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
_("Expected closing tag. Unexpected character '#{data}' found.")})
|
_("Expected closing tag. Unexpected character '#{data}' found.")})
|
||||||
@stream.queue.push(data)
|
@stream.unget(data)
|
||||||
@state = @states[:bogusComment]
|
@state = @states[:bogusComment]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -556,7 +577,7 @@ module HTML5lib
|
||||||
@state = @states[:attributeValueDoubleQuoted]
|
@state = @states[:attributeValueDoubleQuoted]
|
||||||
elsif data == "&"
|
elsif data == "&"
|
||||||
@state = @states[:attributeValueUnQuoted]
|
@state = @states[:attributeValueUnQuoted]
|
||||||
@stream.queue.push(data);
|
@stream.unget(data);
|
||||||
elsif data == "'"
|
elsif data == "'"
|
||||||
@state = @states[:attributeValueSingleQuoted]
|
@state = @states[:attributeValueSingleQuoted]
|
||||||
elsif data == ">"
|
elsif data == ">"
|
||||||
|
@ -656,7 +677,7 @@ module HTML5lib
|
||||||
else
|
else
|
||||||
@tokenQueue.push({:type => :ParseError, :data =>
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
_("Expected '--' or 'DOCTYPE'. Not found.")})
|
_("Expected '--' or 'DOCTYPE'. Not found.")})
|
||||||
@stream.queue += charStack
|
@stream.unget(charStack)
|
||||||
@state = @states[:bogusComment]
|
@state = @states[:bogusComment]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -771,7 +792,7 @@ module HTML5lib
|
||||||
else
|
else
|
||||||
@tokenQueue.push({:type => :ParseError, :data =>
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
_("No space after literal string 'DOCTYPE'.")})
|
_("No space after literal string 'DOCTYPE'.")})
|
||||||
@stream.queue.push(data)
|
@stream.unget(data)
|
||||||
@state = @states[:beforeDoctypeName]
|
@state = @states[:beforeDoctypeName]
|
||||||
end
|
end
|
||||||
return true
|
return true
|
||||||
|
@ -827,7 +848,7 @@ module HTML5lib
|
||||||
@state = @states[:data]
|
@state = @states[:data]
|
||||||
elsif data == :EOF
|
elsif data == :EOF
|
||||||
@currentToken[:data] = true
|
@currentToken[:data] = true
|
||||||
@stream.queue.push(data)
|
@stream.unget(data)
|
||||||
@tokenQueue.push({:type => :ParseError, :data =>
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
_("Unexpected end of file in DOCTYPE.")})
|
_("Unexpected end of file in DOCTYPE.")})
|
||||||
@currentToken[:correct] = false
|
@currentToken[:correct] = false
|
||||||
|
@ -842,7 +863,7 @@ module HTML5lib
|
||||||
elsif token == "system"
|
elsif token == "system"
|
||||||
@state = @states[:beforeDoctypeSystemIdentifier]
|
@state = @states[:beforeDoctypeSystemIdentifier]
|
||||||
else
|
else
|
||||||
@stream.queue += charStack
|
@stream.unget(charStack)
|
||||||
@tokenQueue.push({:type => :ParseError, :data =>
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
_("Expected 'public' or 'system'. Got '#{charStack.join('')}'")})
|
_("Expected 'public' or 'system'. Got '#{charStack.join('')}'")})
|
||||||
@state = @states[:bogusDoctype]
|
@state = @states[:bogusDoctype]
|
||||||
|
@ -1028,7 +1049,7 @@ module HTML5lib
|
||||||
@state = @states[:data]
|
@state = @states[:data]
|
||||||
elsif data == :EOF
|
elsif data == :EOF
|
||||||
# XXX EMIT
|
# XXX EMIT
|
||||||
@stream.queue.push(data)
|
@stream.unget(data)
|
||||||
@tokenQueue.push({:type => :ParseError, :data =>
|
@tokenQueue.push({:type => :ParseError, :data =>
|
||||||
_("Unexpected end of file in bogus doctype.")})
|
_("Unexpected end of file in bogus doctype.")})
|
||||||
@currentToken[:correct] = false
|
@currentToken[:correct] = false
|
|
@ -1,17 +1,17 @@
|
||||||
module HTML5lib
|
module HTML5
|
||||||
module TreeBuilders
|
module TreeBuilders
|
||||||
|
|
||||||
class << self
|
class << self
|
||||||
def [](name)
|
def [](name)
|
||||||
case name.to_s.downcase
|
case name.to_s.downcase
|
||||||
when 'simpletree' then
|
when 'simpletree' then
|
||||||
require 'html5lib/treebuilders/simpletree'
|
require 'html5/treebuilders/simpletree'
|
||||||
SimpleTree::TreeBuilder
|
SimpleTree::TreeBuilder
|
||||||
when 'rexml' then
|
when 'rexml' then
|
||||||
require 'html5lib/treebuilders/rexml'
|
require 'html5/treebuilders/rexml'
|
||||||
REXML::TreeBuilder
|
REXML::TreeBuilder
|
||||||
when 'hpricot' then
|
when 'hpricot' then
|
||||||
require 'html5lib/treebuilders/hpricot'
|
require 'html5/treebuilders/hpricot'
|
||||||
Hpricot::TreeBuilder
|
Hpricot::TreeBuilder
|
||||||
else
|
else
|
||||||
raise "Unknown TreeBuilder #{name}"
|
raise "Unknown TreeBuilder #{name}"
|
|
@ -1,8 +1,8 @@
|
||||||
require 'html5lib/constants'
|
require 'html5/constants'
|
||||||
|
|
||||||
#XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
|
#XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
|
|
||||||
# The scope markers are inserted when entering buttons, object elements,
|
# The scope markers are inserted when entering buttons, object elements,
|
||||||
# marquees, table cells, and table captions, and are used to prevent formatting
|
# marquees, table cells, and table captions, and are used to prevent formatting
|
|
@ -1,9 +1,9 @@
|
||||||
require 'html5lib/treebuilders/base'
|
require 'html5/treebuilders/base'
|
||||||
require 'rubygems'
|
require 'rubygems'
|
||||||
require 'hpricot'
|
require 'hpricot'
|
||||||
require 'forwardable'
|
require 'forwardable'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
module TreeBuilders
|
module TreeBuilders
|
||||||
module Hpricot
|
module Hpricot
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
require 'html5lib/treebuilders/base'
|
require 'html5/treebuilders/base'
|
||||||
require 'rexml/document'
|
require 'rexml/document'
|
||||||
require 'forwardable'
|
require 'forwardable'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
module TreeBuilders
|
module TreeBuilders
|
||||||
module REXML
|
module REXML
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
require 'html5lib/treebuilders/base'
|
require 'html5/treebuilders/base'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
module TreeBuilders
|
module TreeBuilders
|
||||||
module SimpleTree
|
module SimpleTree
|
||||||
|
|
|
@ -1,19 +1,19 @@
|
||||||
require 'html5lib/treewalkers/base'
|
require 'html5/treewalkers/base'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
module TreeWalkers
|
module TreeWalkers
|
||||||
|
|
||||||
class << self
|
class << self
|
||||||
def [](name)
|
def [](name)
|
||||||
case name.to_s.downcase
|
case name.to_s.downcase
|
||||||
when 'simpletree' then
|
when 'simpletree' then
|
||||||
require 'html5lib/treewalkers/simpletree'
|
require 'html5/treewalkers/simpletree'
|
||||||
SimpleTree::TreeWalker
|
SimpleTree::TreeWalker
|
||||||
when 'rexml' then
|
when 'rexml' then
|
||||||
require 'html5lib/treewalkers/rexml'
|
require 'html5/treewalkers/rexml'
|
||||||
REXML::TreeWalker
|
REXML::TreeWalker
|
||||||
when 'hpricot' then
|
when 'hpricot' then
|
||||||
require 'html5lib/treewalkers/hpricot'
|
require 'html5/treewalkers/hpricot'
|
||||||
Hpricot::TreeWalker
|
Hpricot::TreeWalker
|
||||||
else
|
else
|
||||||
raise "Unknown TreeWalker #{name}"
|
raise "Unknown TreeWalker #{name}"
|
|
@ -1,5 +1,5 @@
|
||||||
require 'html5lib/constants'
|
require 'html5/constants'
|
||||||
module HTML5lib
|
module HTML5
|
||||||
module TreeWalkers
|
module TreeWalkers
|
||||||
|
|
||||||
module TokenConstructor
|
module TokenConstructor
|
|
@ -1,10 +1,10 @@
|
||||||
require 'html5lib/treewalkers/base'
|
require 'html5/treewalkers/base'
|
||||||
require 'rexml/document'
|
require 'rexml/document'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
module TreeWalkers
|
module TreeWalkers
|
||||||
module Hpricot
|
module Hpricot
|
||||||
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
|
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
|
||||||
|
|
||||||
def node_details(node)
|
def node_details(node)
|
||||||
case node
|
case node
|
|
@ -1,10 +1,10 @@
|
||||||
require 'html5lib/treewalkers/base'
|
require 'html5/treewalkers/base'
|
||||||
require 'rexml/document'
|
require 'rexml/document'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
module TreeWalkers
|
module TreeWalkers
|
||||||
module REXML
|
module REXML
|
||||||
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
|
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
|
||||||
|
|
||||||
def node_details(node)
|
def node_details(node)
|
||||||
case node
|
case node
|
|
@ -1,10 +1,10 @@
|
||||||
require 'html5lib/treewalkers/base'
|
require 'html5/treewalkers/base'
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
module TreeWalkers
|
module TreeWalkers
|
||||||
module SimpleTree
|
module SimpleTree
|
||||||
class TreeWalker < HTML5lib::TreeWalkers::Base
|
class TreeWalker < HTML5::TreeWalkers::Base
|
||||||
include HTML5lib::TreeBuilders::SimpleTree
|
include HTML5::TreeBuilders::SimpleTree
|
||||||
|
|
||||||
def walk(node)
|
def walk(node)
|
||||||
case node
|
case node
|
708
vendor/plugins/HTML5lib/lib/html5lib/constants.rb
vendored
708
vendor/plugins/HTML5lib/lib/html5lib/constants.rb
vendored
|
@ -1,708 +0,0 @@
|
||||||
module HTML5lib
|
|
||||||
|
|
||||||
class EOF < Exception; end
|
|
||||||
|
|
||||||
CONTENT_MODEL_FLAGS = [
|
|
||||||
:PCDATA,
|
|
||||||
:RCDATA,
|
|
||||||
:CDATA,
|
|
||||||
:PLAINTEXT
|
|
||||||
]
|
|
||||||
|
|
||||||
SCOPING_ELEMENTS = %w[
|
|
||||||
button
|
|
||||||
caption
|
|
||||||
html
|
|
||||||
marquee
|
|
||||||
object
|
|
||||||
table
|
|
||||||
td
|
|
||||||
th
|
|
||||||
]
|
|
||||||
|
|
||||||
FORMATTING_ELEMENTS = %w[
|
|
||||||
a
|
|
||||||
b
|
|
||||||
big
|
|
||||||
em
|
|
||||||
font
|
|
||||||
i
|
|
||||||
nobr
|
|
||||||
s
|
|
||||||
small
|
|
||||||
strike
|
|
||||||
strong
|
|
||||||
tt
|
|
||||||
u
|
|
||||||
]
|
|
||||||
|
|
||||||
SPECIAL_ELEMENTS = %w[
|
|
||||||
address
|
|
||||||
area
|
|
||||||
base
|
|
||||||
basefont
|
|
||||||
bgsound
|
|
||||||
blockquote
|
|
||||||
body
|
|
||||||
br
|
|
||||||
center
|
|
||||||
col
|
|
||||||
colgroup
|
|
||||||
dd
|
|
||||||
dir
|
|
||||||
div
|
|
||||||
dl
|
|
||||||
dt
|
|
||||||
embed
|
|
||||||
fieldset
|
|
||||||
form
|
|
||||||
frame
|
|
||||||
frameset
|
|
||||||
h1
|
|
||||||
h2
|
|
||||||
h3
|
|
||||||
h4
|
|
||||||
h5
|
|
||||||
h6
|
|
||||||
head
|
|
||||||
hr
|
|
||||||
iframe
|
|
||||||
image
|
|
||||||
img
|
|
||||||
input
|
|
||||||
isindex
|
|
||||||
li
|
|
||||||
link
|
|
||||||
listing
|
|
||||||
menu
|
|
||||||
meta
|
|
||||||
noembed
|
|
||||||
noframes
|
|
||||||
noscript
|
|
||||||
ol
|
|
||||||
optgroup
|
|
||||||
option
|
|
||||||
p
|
|
||||||
param
|
|
||||||
plaintext
|
|
||||||
pre
|
|
||||||
script
|
|
||||||
select
|
|
||||||
spacer
|
|
||||||
style
|
|
||||||
tbody
|
|
||||||
textarea
|
|
||||||
tfoot
|
|
||||||
thead
|
|
||||||
title
|
|
||||||
tr
|
|
||||||
ul
|
|
||||||
wbr
|
|
||||||
]
|
|
||||||
|
|
||||||
SPACE_CHARACTERS = %W[
|
|
||||||
\t
|
|
||||||
\n
|
|
||||||
\x0B
|
|
||||||
\x0C
|
|
||||||
\x20
|
|
||||||
\r
|
|
||||||
]
|
|
||||||
|
|
||||||
TABLE_INSERT_MODE_ELEMENTS = %w[
|
|
||||||
table
|
|
||||||
tbody
|
|
||||||
tfoot
|
|
||||||
thead
|
|
||||||
tr
|
|
||||||
]
|
|
||||||
|
|
||||||
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
|
|
||||||
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
|
|
||||||
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
|
|
||||||
DIGITS = '0'..'9'
|
|
||||||
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
|
|
||||||
|
|
||||||
# Heading elements need to be ordered
|
|
||||||
HEADING_ELEMENTS = %w[
|
|
||||||
h1
|
|
||||||
h2
|
|
||||||
h3
|
|
||||||
h4
|
|
||||||
h5
|
|
||||||
h6
|
|
||||||
]
|
|
||||||
|
|
||||||
# XXX What about event-source and command?
|
|
||||||
VOID_ELEMENTS = %w[
|
|
||||||
base
|
|
||||||
link
|
|
||||||
meta
|
|
||||||
hr
|
|
||||||
br
|
|
||||||
img
|
|
||||||
embed
|
|
||||||
param
|
|
||||||
area
|
|
||||||
col
|
|
||||||
input
|
|
||||||
]
|
|
||||||
|
|
||||||
CDATA_ELEMENTS = %w[title textarea]
|
|
||||||
|
|
||||||
RCDATA_ELEMENTS = %w[
|
|
||||||
style
|
|
||||||
script
|
|
||||||
xmp
|
|
||||||
iframe
|
|
||||||
noembed
|
|
||||||
noframes
|
|
||||||
noscript
|
|
||||||
]
|
|
||||||
|
|
||||||
BOOLEAN_ATTRIBUTES = {
|
|
||||||
:global => %w[irrelevant],
|
|
||||||
'style' => %w[scoped],
|
|
||||||
'img' => %w[ismap],
|
|
||||||
'audio' => %w[autoplay controls],
|
|
||||||
'video' => %w[autoplay controls],
|
|
||||||
'script' => %w[defer async],
|
|
||||||
'details' => %w[open],
|
|
||||||
'datagrid' => %w[multiple disabled],
|
|
||||||
'command' => %w[hidden disabled checked default],
|
|
||||||
'menu' => %w[autosubmit],
|
|
||||||
'fieldset' => %w[disabled readonly],
|
|
||||||
'option' => %w[disabled readonly selected],
|
|
||||||
'optgroup' => %w[disabled readonly],
|
|
||||||
'button' => %w[disabled autofocus],
|
|
||||||
'input' => %w[disabled readonly required autofocus checked ismap],
|
|
||||||
'select' => %w[disabled readonly autofocus multiple],
|
|
||||||
'output' => %w[disabled readonly]
|
|
||||||
}
|
|
||||||
|
|
||||||
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
|
|
||||||
ENTITIES_WINDOWS1252 = [
|
|
||||||
8364, # 0x80 0x20AC EURO SIGN
|
|
||||||
65533, # 0x81 UNDEFINED
|
|
||||||
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
|
|
||||||
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
|
|
||||||
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
|
|
||||||
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
|
|
||||||
8224, # 0x86 0x2020 DAGGER
|
|
||||||
8225, # 0x87 0x2021 DOUBLE DAGGER
|
|
||||||
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
|
|
||||||
8240, # 0x89 0x2030 PER MILLE SIGN
|
|
||||||
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
|
|
||||||
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
|
||||||
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
|
|
||||||
65533, # 0x8D UNDEFINED
|
|
||||||
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
|
|
||||||
65533, # 0x8F UNDEFINED
|
|
||||||
65533, # 0x90 UNDEFINED
|
|
||||||
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
|
|
||||||
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
|
|
||||||
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
|
|
||||||
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
|
|
||||||
8226, # 0x95 0x2022 BULLET
|
|
||||||
8211, # 0x96 0x2013 EN DASH
|
|
||||||
8212, # 0x97 0x2014 EM DASH
|
|
||||||
732, # 0x98 0x02DC SMALL TILDE
|
|
||||||
8482, # 0x99 0x2122 TRADE MARK SIGN
|
|
||||||
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
|
|
||||||
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
|
||||||
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
|
|
||||||
65533, # 0x9D UNDEFINED
|
|
||||||
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
|
|
||||||
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
|
||||||
]
|
|
||||||
|
|
||||||
private
|
|
||||||
|
|
||||||
def self.U n
|
|
||||||
[n].pack('U')
|
|
||||||
end
|
|
||||||
|
|
||||||
public
|
|
||||||
|
|
||||||
ENTITIES = {
|
|
||||||
"AElig" => U(0xC6),
|
|
||||||
"Aacute" => U(0xC1),
|
|
||||||
"Acirc" => U(0xC2),
|
|
||||||
"Agrave" => U(0xC0),
|
|
||||||
"Alpha" => U(0x0391),
|
|
||||||
"Aring" => U(0xC5),
|
|
||||||
"Atilde" => U(0xC3),
|
|
||||||
"Auml" => U(0xC4),
|
|
||||||
"Beta" => U(0x0392),
|
|
||||||
"Ccedil" => U(0xC7),
|
|
||||||
"Chi" => U(0x03A7),
|
|
||||||
"Dagger" => U(0x2021),
|
|
||||||
"Delta" => U(0x0394),
|
|
||||||
"ETH" => U(0xD0),
|
|
||||||
"Eacute" => U(0xC9),
|
|
||||||
"Ecirc" => U(0xCA),
|
|
||||||
"Egrave" => U(0xC8),
|
|
||||||
"Epsilon" => U(0x0395),
|
|
||||||
"Eta" => U(0x0397),
|
|
||||||
"Euml" => U(0xCB),
|
|
||||||
"Gamma" => U(0x0393),
|
|
||||||
"Iacute" => U(0xCD),
|
|
||||||
"Icirc" => U(0xCE),
|
|
||||||
"Igrave" => U(0xCC),
|
|
||||||
"Iota" => U(0x0399),
|
|
||||||
"Iuml" => U(0xCF),
|
|
||||||
"Kappa" => U(0x039A),
|
|
||||||
"Lambda" => U(0x039B),
|
|
||||||
"Mu" => U(0x039C),
|
|
||||||
"Ntilde" => U(0xD1),
|
|
||||||
"Nu" => U(0x039D),
|
|
||||||
"OElig" => U(0x0152),
|
|
||||||
"Oacute" => U(0xD3),
|
|
||||||
"Ocirc" => U(0xD4),
|
|
||||||
"Ograve" => U(0xD2),
|
|
||||||
"Omega" => U(0x03A9),
|
|
||||||
"Omicron" => U(0x039F),
|
|
||||||
"Oslash" => U(0xD8),
|
|
||||||
"Otilde" => U(0xD5),
|
|
||||||
"Ouml" => U(0xD6),
|
|
||||||
"Phi" => U(0x03A6),
|
|
||||||
"Pi" => U(0x03A0),
|
|
||||||
"Prime" => U(0x2033),
|
|
||||||
"Psi" => U(0x03A8),
|
|
||||||
"Rho" => U(0x03A1),
|
|
||||||
"Scaron" => U(0x0160),
|
|
||||||
"Sigma" => U(0x03A3),
|
|
||||||
"THORN" => U(0xDE),
|
|
||||||
"Tau" => U(0x03A4),
|
|
||||||
"Theta" => U(0x0398),
|
|
||||||
"Uacute" => U(0xDA),
|
|
||||||
"Ucirc" => U(0xDB),
|
|
||||||
"Ugrave" => U(0xD9),
|
|
||||||
"Upsilon" => U(0x03A5),
|
|
||||||
"Uuml" => U(0xDC),
|
|
||||||
"Xi" => U(0x039E),
|
|
||||||
"Yacute" => U(0xDD),
|
|
||||||
"Yuml" => U(0x0178),
|
|
||||||
"Zeta" => U(0x0396),
|
|
||||||
"aacute" => U(0xE1),
|
|
||||||
"acirc" => U(0xE2),
|
|
||||||
"acute" => U(0xB4),
|
|
||||||
"aelig" => U(0xE6),
|
|
||||||
"agrave" => U(0xE0),
|
|
||||||
"alefsym" => U(0x2135),
|
|
||||||
"alpha" => U(0x03B1),
|
|
||||||
"amp" => U(0x26),
|
|
||||||
"AMP" => U(0x26),
|
|
||||||
"and" => U(0x2227),
|
|
||||||
"ang" => U(0x2220),
|
|
||||||
"apos" => U(0x27),
|
|
||||||
"aring" => U(0xE5),
|
|
||||||
"asymp" => U(0x2248),
|
|
||||||
"atilde" => U(0xE3),
|
|
||||||
"auml" => U(0xE4),
|
|
||||||
"bdquo" => U(0x201E),
|
|
||||||
"beta" => U(0x03B2),
|
|
||||||
"brvbar" => U(0xA6),
|
|
||||||
"bull" => U(0x2022),
|
|
||||||
"cap" => U(0x2229),
|
|
||||||
"ccedil" => U(0xE7),
|
|
||||||
"cedil" => U(0xB8),
|
|
||||||
"cent" => U(0xA2),
|
|
||||||
"chi" => U(0x03C7),
|
|
||||||
"circ" => U(0x02C6),
|
|
||||||
"clubs" => U(0x2663),
|
|
||||||
"cong" => U(0x2245),
|
|
||||||
"copy" => U(0xA9),
|
|
||||||
"COPY" => U(0xA9),
|
|
||||||
"crarr" => U(0x21B5),
|
|
||||||
"cup" => U(0x222A),
|
|
||||||
"curren" => U(0xA4),
|
|
||||||
"dArr" => U(0x21D3),
|
|
||||||
"dagger" => U(0x2020),
|
|
||||||
"darr" => U(0x2193),
|
|
||||||
"deg" => U(0xB0),
|
|
||||||
"delta" => U(0x03B4),
|
|
||||||
"diams" => U(0x2666),
|
|
||||||
"divide" => U(0xF7),
|
|
||||||
"eacute" => U(0xE9),
|
|
||||||
"ecirc" => U(0xEA),
|
|
||||||
"egrave" => U(0xE8),
|
|
||||||
"empty" => U(0x2205),
|
|
||||||
"emsp" => U(0x2003),
|
|
||||||
"ensp" => U(0x2002),
|
|
||||||
"epsilon" => U(0x03B5),
|
|
||||||
"equiv" => U(0x2261),
|
|
||||||
"eta" => U(0x03B7),
|
|
||||||
"eth" => U(0xF0),
|
|
||||||
"euml" => U(0xEB),
|
|
||||||
"euro" => U(0x20AC),
|
|
||||||
"exist" => U(0x2203),
|
|
||||||
"fnof" => U(0x0192),
|
|
||||||
"forall" => U(0x2200),
|
|
||||||
"frac12" => U(0xBD),
|
|
||||||
"frac14" => U(0xBC),
|
|
||||||
"frac34" => U(0xBE),
|
|
||||||
"frasl" => U(0x2044),
|
|
||||||
"gamma" => U(0x03B3),
|
|
||||||
"ge" => U(0x2265),
|
|
||||||
"gt" => U(0x3E),
|
|
||||||
"GT" => U(0x3E),
|
|
||||||
"hArr" => U(0x21D4),
|
|
||||||
"harr" => U(0x2194),
|
|
||||||
"hearts" => U(0x2665),
|
|
||||||
"hellip" => U(0x2026),
|
|
||||||
"iacute" => U(0xED),
|
|
||||||
"icirc" => U(0xEE),
|
|
||||||
"iexcl" => U(0xA1),
|
|
||||||
"igrave" => U(0xEC),
|
|
||||||
"image" => U(0x2111),
|
|
||||||
"infin" => U(0x221E),
|
|
||||||
"int" => U(0x222B),
|
|
||||||
"iota" => U(0x03B9),
|
|
||||||
"iquest" => U(0xBF),
|
|
||||||
"isin" => U(0x2208),
|
|
||||||
"iuml" => U(0xEF),
|
|
||||||
"kappa" => U(0x03BA),
|
|
||||||
"lArr" => U(0x21D0),
|
|
||||||
"lambda" => U(0x03BB),
|
|
||||||
"lang" => U(0x2329),
|
|
||||||
"laquo" => U(0xAB),
|
|
||||||
"larr" => U(0x2190),
|
|
||||||
"lceil" => U(0x2308),
|
|
||||||
"ldquo" => U(0x201C),
|
|
||||||
"le" => U(0x2264),
|
|
||||||
"lfloor" => U(0x230A),
|
|
||||||
"lowast" => U(0x2217),
|
|
||||||
"loz" => U(0x25CA),
|
|
||||||
"lrm" => U(0x200E),
|
|
||||||
"lsaquo" => U(0x2039),
|
|
||||||
"lsquo" => U(0x2018),
|
|
||||||
"lt" => U(0x3C),
|
|
||||||
"LT" => U(0x3C),
|
|
||||||
"macr" => U(0xAF),
|
|
||||||
"mdash" => U(0x2014),
|
|
||||||
"micro" => U(0xB5),
|
|
||||||
"middot" => U(0xB7),
|
|
||||||
"minus" => U(0x2212),
|
|
||||||
"mu" => U(0x03BC),
|
|
||||||
"nabla" => U(0x2207),
|
|
||||||
"nbsp" => U(0xA0),
|
|
||||||
"ndash" => U(0x2013),
|
|
||||||
"ne" => U(0x2260),
|
|
||||||
"ni" => U(0x220B),
|
|
||||||
"not" => U(0xAC),
|
|
||||||
"notin" => U(0x2209),
|
|
||||||
"nsub" => U(0x2284),
|
|
||||||
"ntilde" => U(0xF1),
|
|
||||||
"nu" => U(0x03BD),
|
|
||||||
"oacute" => U(0xF3),
|
|
||||||
"ocirc" => U(0xF4),
|
|
||||||
"oelig" => U(0x0153),
|
|
||||||
"ograve" => U(0xF2),
|
|
||||||
"oline" => U(0x203E),
|
|
||||||
"omega" => U(0x03C9),
|
|
||||||
"omicron" => U(0x03BF),
|
|
||||||
"oplus" => U(0x2295),
|
|
||||||
"or" => U(0x2228),
|
|
||||||
"ordf" => U(0xAA),
|
|
||||||
"ordm" => U(0xBA),
|
|
||||||
"oslash" => U(0xF8),
|
|
||||||
"otilde" => U(0xF5),
|
|
||||||
"otimes" => U(0x2297),
|
|
||||||
"ouml" => U(0xF6),
|
|
||||||
"para" => U(0xB6),
|
|
||||||
"part" => U(0x2202),
|
|
||||||
"permil" => U(0x2030),
|
|
||||||
"perp" => U(0x22A5),
|
|
||||||
"phi" => U(0x03C6),
|
|
||||||
"pi" => U(0x03C0),
|
|
||||||
"piv" => U(0x03D6),
|
|
||||||
"plusmn" => U(0xB1),
|
|
||||||
"pound" => U(0xA3),
|
|
||||||
"prime" => U(0x2032),
|
|
||||||
"prod" => U(0x220F),
|
|
||||||
"prop" => U(0x221D),
|
|
||||||
"psi" => U(0x03C8),
|
|
||||||
"quot" => U(0x22),
|
|
||||||
"QUOT" => U(0x22),
|
|
||||||
"rArr" => U(0x21D2),
|
|
||||||
"radic" => U(0x221A),
|
|
||||||
"rang" => U(0x232A),
|
|
||||||
"raquo" => U(0xBB),
|
|
||||||
"rarr" => U(0x2192),
|
|
||||||
"rceil" => U(0x2309),
|
|
||||||
"rdquo" => U(0x201D),
|
|
||||||
"real" => U(0x211C),
|
|
||||||
"reg" => U(0xAE),
|
|
||||||
"REG" => U(0xAE),
|
|
||||||
"rfloor" => U(0x230B),
|
|
||||||
"rho" => U(0x03C1),
|
|
||||||
"rlm" => U(0x200F),
|
|
||||||
"rsaquo" => U(0x203A),
|
|
||||||
"rsquo" => U(0x2019),
|
|
||||||
"sbquo" => U(0x201A),
|
|
||||||
"scaron" => U(0x0161),
|
|
||||||
"sdot" => U(0x22C5),
|
|
||||||
"sect" => U(0xA7),
|
|
||||||
"shy" => U(0xAD),
|
|
||||||
"sigma" => U(0x03C3),
|
|
||||||
"sigmaf" => U(0x03C2),
|
|
||||||
"sim" => U(0x223C),
|
|
||||||
"spades" => U(0x2660),
|
|
||||||
"sub" => U(0x2282),
|
|
||||||
"sube" => U(0x2286),
|
|
||||||
"sum" => U(0x2211),
|
|
||||||
"sup" => U(0x2283),
|
|
||||||
"sup1" => U(0xB9),
|
|
||||||
"sup2" => U(0xB2),
|
|
||||||
"sup3" => U(0xB3),
|
|
||||||
"supe" => U(0x2287),
|
|
||||||
"szlig" => U(0xDF),
|
|
||||||
"tau" => U(0x03C4),
|
|
||||||
"there4" => U(0x2234),
|
|
||||||
"theta" => U(0x03B8),
|
|
||||||
"thetasym" => U(0x03D1),
|
|
||||||
"thinsp" => U(0x2009),
|
|
||||||
"thorn" => U(0xFE),
|
|
||||||
"tilde" => U(0x02DC),
|
|
||||||
"times" => U(0xD7),
|
|
||||||
"trade" => U(0x2122),
|
|
||||||
"uArr" => U(0x21D1),
|
|
||||||
"uacute" => U(0xFA),
|
|
||||||
"uarr" => U(0x2191),
|
|
||||||
"ucirc" => U(0xFB),
|
|
||||||
"ugrave" => U(0xF9),
|
|
||||||
"uml" => U(0xA8),
|
|
||||||
"upsih" => U(0x03D2),
|
|
||||||
"upsilon" => U(0x03C5),
|
|
||||||
"uuml" => U(0xFC),
|
|
||||||
"weierp" => U(0x2118),
|
|
||||||
"xi" => U(0x03BE),
|
|
||||||
"yacute" => U(0xFD),
|
|
||||||
"yen" => U(0xA5),
|
|
||||||
"yuml" => U(0xFF),
|
|
||||||
"zeta" => U(0x03B6),
|
|
||||||
"zwj" => U(0x200D),
|
|
||||||
"zwnj" => U(0x200C)
|
|
||||||
}
|
|
||||||
|
|
||||||
ENCODINGS = %w[
|
|
||||||
ansi_x3.4-1968
|
|
||||||
iso-ir-6
|
|
||||||
ansi_x3.4-1986
|
|
||||||
iso_646.irv:1991
|
|
||||||
ascii
|
|
||||||
iso646-us
|
|
||||||
us-ascii
|
|
||||||
us
|
|
||||||
ibm367
|
|
||||||
cp367
|
|
||||||
csascii
|
|
||||||
ks_c_5601-1987
|
|
||||||
korean
|
|
||||||
iso-2022-kr
|
|
||||||
csiso2022kr
|
|
||||||
euc-kr
|
|
||||||
iso-2022-jp
|
|
||||||
csiso2022jp
|
|
||||||
iso-2022-jp-2
|
|
||||||
iso-ir-58
|
|
||||||
chinese
|
|
||||||
csiso58gb231280
|
|
||||||
iso_8859-1:1987
|
|
||||||
iso-ir-100
|
|
||||||
iso_8859-1
|
|
||||||
iso-8859-1
|
|
||||||
latin1
|
|
||||||
l1
|
|
||||||
ibm819
|
|
||||||
cp819
|
|
||||||
csisolatin1
|
|
||||||
iso_8859-2:1987
|
|
||||||
iso-ir-101
|
|
||||||
iso_8859-2
|
|
||||||
iso-8859-2
|
|
||||||
latin2
|
|
||||||
l2
|
|
||||||
csisolatin2
|
|
||||||
iso_8859-3:1988
|
|
||||||
iso-ir-109
|
|
||||||
iso_8859-3
|
|
||||||
iso-8859-3
|
|
||||||
latin3
|
|
||||||
l3
|
|
||||||
csisolatin3
|
|
||||||
iso_8859-4:1988
|
|
||||||
iso-ir-110
|
|
||||||
iso_8859-4
|
|
||||||
iso-8859-4
|
|
||||||
latin4
|
|
||||||
l4
|
|
||||||
csisolatin4
|
|
||||||
iso_8859-6:1987
|
|
||||||
iso-ir-127
|
|
||||||
iso_8859-6
|
|
||||||
iso-8859-6
|
|
||||||
ecma-114
|
|
||||||
asmo-708
|
|
||||||
arabic
|
|
||||||
csisolatinarabic
|
|
||||||
iso_8859-7:1987
|
|
||||||
iso-ir-126
|
|
||||||
iso_8859-7
|
|
||||||
iso-8859-7
|
|
||||||
elot_928
|
|
||||||
ecma-118
|
|
||||||
greek
|
|
||||||
greek8
|
|
||||||
csisolatingreek
|
|
||||||
iso_8859-8:1988
|
|
||||||
iso-ir-138
|
|
||||||
iso_8859-8
|
|
||||||
iso-8859-8
|
|
||||||
hebrew
|
|
||||||
csisolatinhebrew
|
|
||||||
iso_8859-5:1988
|
|
||||||
iso-ir-144
|
|
||||||
iso_8859-5
|
|
||||||
iso-8859-5
|
|
||||||
cyrillic
|
|
||||||
csisolatincyrillic
|
|
||||||
iso_8859-9:1989
|
|
||||||
iso-ir-148
|
|
||||||
iso_8859-9
|
|
||||||
iso-8859-9
|
|
||||||
latin5
|
|
||||||
l5
|
|
||||||
csisolatin5
|
|
||||||
iso-8859-10
|
|
||||||
iso-ir-157
|
|
||||||
l6
|
|
||||||
iso_8859-10:1992
|
|
||||||
csisolatin6
|
|
||||||
latin6
|
|
||||||
hp-roman8
|
|
||||||
roman8
|
|
||||||
r8
|
|
||||||
ibm037
|
|
||||||
cp037
|
|
||||||
csibm037
|
|
||||||
ibm424
|
|
||||||
cp424
|
|
||||||
csibm424
|
|
||||||
ibm437
|
|
||||||
cp437
|
|
||||||
437
|
|
||||||
cspc8codepage437
|
|
||||||
ibm500
|
|
||||||
cp500
|
|
||||||
csibm500
|
|
||||||
ibm775
|
|
||||||
cp775
|
|
||||||
cspc775baltic
|
|
||||||
ibm850
|
|
||||||
cp850
|
|
||||||
850
|
|
||||||
cspc850multilingual
|
|
||||||
ibm852
|
|
||||||
cp852
|
|
||||||
852
|
|
||||||
cspcp852
|
|
||||||
ibm855
|
|
||||||
cp855
|
|
||||||
855
|
|
||||||
csibm855
|
|
||||||
ibm857
|
|
||||||
cp857
|
|
||||||
857
|
|
||||||
csibm857
|
|
||||||
ibm860
|
|
||||||
cp860
|
|
||||||
860
|
|
||||||
csibm860
|
|
||||||
ibm861
|
|
||||||
cp861
|
|
||||||
861
|
|
||||||
cp-is
|
|
||||||
csibm861
|
|
||||||
ibm862
|
|
||||||
cp862
|
|
||||||
862
|
|
||||||
cspc862latinhebrew
|
|
||||||
ibm863
|
|
||||||
cp863
|
|
||||||
863
|
|
||||||
csibm863
|
|
||||||
ibm864
|
|
||||||
cp864
|
|
||||||
csibm864
|
|
||||||
ibm865
|
|
||||||
cp865
|
|
||||||
865
|
|
||||||
csibm865
|
|
||||||
ibm866
|
|
||||||
cp866
|
|
||||||
866
|
|
||||||
csibm866
|
|
||||||
ibm869
|
|
||||||
cp869
|
|
||||||
869
|
|
||||||
cp-gr
|
|
||||||
csibm869
|
|
||||||
ibm1026
|
|
||||||
cp1026
|
|
||||||
csibm1026
|
|
||||||
koi8-r
|
|
||||||
cskoi8r
|
|
||||||
koi8-u
|
|
||||||
big5-hkscs
|
|
||||||
ptcp154
|
|
||||||
csptcp154
|
|
||||||
pt154
|
|
||||||
cp154
|
|
||||||
utf-7
|
|
||||||
utf-16be
|
|
||||||
utf-16le
|
|
||||||
utf-16
|
|
||||||
utf-8
|
|
||||||
iso-8859-13
|
|
||||||
iso-8859-14
|
|
||||||
iso-ir-199
|
|
||||||
iso_8859-14:1998
|
|
||||||
iso_8859-14
|
|
||||||
latin8
|
|
||||||
iso-celtic
|
|
||||||
l8
|
|
||||||
iso-8859-15
|
|
||||||
iso_8859-15
|
|
||||||
iso-8859-16
|
|
||||||
iso-ir-226
|
|
||||||
iso_8859-16:2001
|
|
||||||
iso_8859-16
|
|
||||||
latin10
|
|
||||||
l10
|
|
||||||
gbk
|
|
||||||
cp936
|
|
||||||
ms936
|
|
||||||
gb18030
|
|
||||||
shift_jis
|
|
||||||
ms_kanji
|
|
||||||
csshiftjis
|
|
||||||
euc-jp
|
|
||||||
gb2312
|
|
||||||
big5
|
|
||||||
csbig5
|
|
||||||
windows-1250
|
|
||||||
windows-1251
|
|
||||||
windows-1252
|
|
||||||
windows-1253
|
|
||||||
windows-1254
|
|
||||||
windows-1255
|
|
||||||
windows-1256
|
|
||||||
windows-1257
|
|
||||||
windows-1258
|
|
||||||
tis-620
|
|
||||||
hz-gb-2312
|
|
||||||
]
|
|
||||||
|
|
||||||
end
|
|
|
@ -1 +0,0 @@
|
||||||
require 'html5lib/filters/optionaltags'
|
|
|
@ -1,2 +0,0 @@
|
||||||
require 'html5lib/serializer/htmlserializer'
|
|
||||||
require 'html5lib/serializer/xhtmlserializer'
|
|
24
vendor/plugins/HTML5lib/parse.rb
vendored
24
vendor/plugins/HTML5lib/parse.rb
vendored
|
@ -26,15 +26,15 @@ def parse(opts, args)
|
||||||
exit(1)
|
exit(1)
|
||||||
end
|
end
|
||||||
|
|
||||||
require 'html5lib/treebuilders'
|
require 'html5/treebuilders'
|
||||||
treebuilder = HTML5lib::TreeBuilders[opts.treebuilder]
|
treebuilder = HTML5::TreeBuilders[opts.treebuilder]
|
||||||
|
|
||||||
if opts.output == :xml
|
if opts.output == :xml
|
||||||
require 'html5lib/liberalxmlparser'
|
require 'html5/liberalxmlparser'
|
||||||
p = HTML5lib::XHTMLParser.new(:tree=>treebuilder)
|
p = HTML5::XHTMLParser.new(:tree=>treebuilder)
|
||||||
else
|
else
|
||||||
require 'html5lib/html5parser'
|
require 'html5/html5parser'
|
||||||
p = HTML5lib::HTMLParser.new(:tree=>treebuilder)
|
p = HTML5::HTMLParser.new(:tree=>treebuilder)
|
||||||
end
|
end
|
||||||
|
|
||||||
if opts.parsemethod == :parse
|
if opts.parsemethod == :parse
|
||||||
|
@ -70,10 +70,10 @@ def printOutput(parser, document, opts)
|
||||||
when :xml
|
when :xml
|
||||||
print document
|
print document
|
||||||
when :html
|
when :html
|
||||||
require 'html5lib/treewalkers'
|
require 'html5/treewalkers'
|
||||||
tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document)
|
tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
|
||||||
require 'html5lib/serializer'
|
require 'html5/serializer'
|
||||||
puts HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer)
|
puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
|
||||||
when :hilite
|
when :hilite
|
||||||
print document.hilite
|
print document.hilite
|
||||||
when :tree
|
when :tree
|
||||||
|
@ -188,6 +188,10 @@ opts = OptionParser.new do |opts|
|
||||||
options.serializer[:escape_lt_in_attrs] = lt
|
options.serializer[:escape_lt_in_attrs] = lt
|
||||||
end
|
end
|
||||||
|
|
||||||
|
opts.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata|
|
||||||
|
options.serializer[:escape_rcdata] = rcdata
|
||||||
|
end
|
||||||
|
|
||||||
opts.separator ""
|
opts.separator ""
|
||||||
opts.separator "Other Options:"
|
opts.separator "Other Options:"
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,6 @@ EUC-jp
|
||||||
#encoding
|
#encoding
|
||||||
EUC-jp
|
EUC-jp
|
||||||
|
|
||||||
|
|
||||||
#data
|
#data
|
||||||
<!-- -->
|
<!-- -->
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||||
|
|
|
@ -92,7 +92,8 @@
|
||||||
|
|
||||||
{"description": "rcdata",
|
{"description": "rcdata",
|
||||||
"input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]],
|
"input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]],
|
||||||
"expected": ["<script>a<b>c&d"]
|
"expected": ["<script>a<b>c&d"],
|
||||||
|
"xhtml": ["<script>a<b>c&d"]
|
||||||
},
|
},
|
||||||
|
|
||||||
{"description": "doctype",
|
{"description": "doctype",
|
||||||
|
|
|
@ -49,6 +49,12 @@
|
||||||
"options": {"escape_lt_in_attrs": true},
|
"options": {"escape_lt_in_attrs": true},
|
||||||
"input": [["StartTag", "a", {"title": "a<b>c&d"}]],
|
"input": [["StartTag", "a", {"title": "a<b>c&d"}]],
|
||||||
"expected": ["<a title=\"a<b>c&d\">"]
|
"expected": ["<a title=\"a<b>c&d\">"]
|
||||||
|
},
|
||||||
|
|
||||||
|
{"description": "rcdata",
|
||||||
|
"options": {"escape_rcdata": true},
|
||||||
|
"input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]],
|
||||||
|
"expected": ["<script>a<b>c&d"]
|
||||||
}
|
}
|
||||||
|
|
||||||
]}
|
]}
|
||||||
|
|
|
@ -135,7 +135,7 @@
|
||||||
|
|
||||||
{"description":"Entity without trailing semicolon (2)",
|
{"description":"Entity without trailing semicolon (2)",
|
||||||
"input":"I'm ¬in",
|
"input":"I'm ¬in",
|
||||||
"output":[["Character","I'm "], "ParseError", ["Character", "∉"]]},
|
"output":[["Character","I'm "], "ParseError", ["Character", "¬in"]]},
|
||||||
|
|
||||||
{"description":"Partial entity match at end of file",
|
{"description":"Partial entity match at end of file",
|
||||||
"input":"I'm &no",
|
"input":"I'm &no",
|
||||||
|
@ -151,6 +151,18 @@
|
||||||
|
|
||||||
{"description":"Hexadecimal entity in attribute",
|
{"description":"Hexadecimal entity in attribute",
|
||||||
"input":"<h a='?'></h>",
|
"input":"<h a='?'></h>",
|
||||||
"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]}
|
"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]},
|
||||||
|
|
||||||
|
{"description":"Entity in attribute without semicolon ending in x",
|
||||||
|
"input":"<h a='¬x'>",
|
||||||
|
"output":["ParseError", ["StartTag", "h", {"a":"¬x"}]]},
|
||||||
|
|
||||||
|
{"description":"Entity in attribute without semicolon ending in 1",
|
||||||
|
"input":"<h a='¬1'>",
|
||||||
|
"output":["ParseError", ["StartTag", "h", {"a":"¬1"}]]},
|
||||||
|
|
||||||
|
{"description":"Entity in attribute without semicolon",
|
||||||
|
"input":"<h a='©'>",
|
||||||
|
"output":["ParseError", ["StartTag", "h", {"a":"©"}]]}
|
||||||
|
|
||||||
]}
|
]}
|
||||||
|
|
|
@ -42,19 +42,23 @@
|
||||||
|
|
||||||
{"description":"Numeric entity representing the NUL character",
|
{"description":"Numeric entity representing the NUL character",
|
||||||
"input":"�",
|
"input":"�",
|
||||||
"output":[["Character", "\uFFFD"]]},
|
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||||
|
|
||||||
{"description":"Hexadecimal entity representing the NUL character",
|
{"description":"Hexadecimal entity representing the NUL character",
|
||||||
"input":"�",
|
"input":"�",
|
||||||
"output":[["Character", "\uFFFD"]]},
|
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||||
|
|
||||||
{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
|
{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
|
||||||
"input":"�",
|
"input":"�",
|
||||||
"output":[["Character", "\uFFFD"]]},
|
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||||
|
|
||||||
{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
|
{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
|
||||||
"input":"�",
|
"input":"�",
|
||||||
"output":[["Character", "\uFFFD"]]},
|
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||||
|
|
||||||
|
{"description":"Hexadecimal entity pair representing a surrogate pair",
|
||||||
|
"input":"��",
|
||||||
|
"output":["ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"]]},
|
||||||
|
|
||||||
{"description":"Numeric entity representing a Windows-1252 'codepoint'",
|
{"description":"Numeric entity representing a Windows-1252 'codepoint'",
|
||||||
"input":"‰",
|
"input":"‰",
|
||||||
|
@ -118,7 +122,7 @@
|
||||||
|
|
||||||
{"description":"Null Byte Replacement",
|
{"description":"Null Byte Replacement",
|
||||||
"input":"\u0000",
|
"input":"\u0000",
|
||||||
"output":[["Character", "\ufffd"]]}
|
"output":["ParseError", ["Character", "\ufffd"]]}
|
||||||
|
|
||||||
]}
|
]}
|
||||||
|
|
||||||
|
|
|
@ -285,6 +285,7 @@ Line1<br>Line2<br>Line3<br>Line4
|
||||||
| <div>
|
| <div>
|
||||||
| <b>
|
| <b>
|
||||||
| <marquee>
|
| <marquee>
|
||||||
|
| <p>
|
||||||
| "X"
|
| "X"
|
||||||
|
|
||||||
#data
|
#data
|
||||||
|
@ -330,6 +331,7 @@ Unexpected end of file
|
||||||
| <body>
|
| <body>
|
||||||
| <p>
|
| <p>
|
||||||
| <hr>
|
| <hr>
|
||||||
|
| <p>
|
||||||
|
|
||||||
#data
|
#data
|
||||||
<select><b><option><select><option></b></select>X
|
<select><b><option><select><option></b></select>X
|
||||||
|
@ -1369,13 +1371,14 @@ unexpected EOF
|
||||||
<head></p><meta><p>
|
<head></p><meta><p>
|
||||||
#errors
|
#errors
|
||||||
6: missing document type declaration
|
6: missing document type declaration
|
||||||
10: unexpected p element end tag in head
|
10: unexpected p element end tag
|
||||||
#document
|
#document
|
||||||
| <html>
|
| <html>
|
||||||
| <head>
|
| <head>
|
||||||
| <meta>
|
|
||||||
| <body>
|
| <body>
|
||||||
| <p>
|
| <p>
|
||||||
|
| <meta>
|
||||||
|
| <p>
|
||||||
|
|
||||||
#data
|
#data
|
||||||
<head></html><meta><p>
|
<head></html><meta><p>
|
||||||
|
@ -1485,6 +1488,7 @@ unexpected EOF
|
||||||
| <div>
|
| <div>
|
||||||
| <b>
|
| <b>
|
||||||
| <marquee>
|
| <marquee>
|
||||||
|
| <p>
|
||||||
|
|
||||||
#data
|
#data
|
||||||
<script></script></div><title></title><p><p>
|
<script></script></div><title></title><p><p>
|
||||||
|
@ -1511,6 +1515,7 @@ unexpected EOF
|
||||||
| <body>
|
| <body>
|
||||||
| <p>
|
| <p>
|
||||||
| <hr>
|
| <hr>
|
||||||
|
| <p>
|
||||||
|
|
||||||
#data
|
#data
|
||||||
<select><b><option><select><option></b></select>
|
<select><b><option><select><option></b></select>
|
||||||
|
@ -1807,6 +1812,7 @@ Unexpected EOF
|
||||||
| <head>
|
| <head>
|
||||||
| <body>
|
| <body>
|
||||||
| <br>
|
| <br>
|
||||||
|
| <p>
|
||||||
|
|
||||||
#data
|
#data
|
||||||
<table><tr></strong></b></em></i></u></strike></s></blink></tt></pre></big></small></font></select></h1></h2></h3></h4></h5></h6></body></br></a></img></title></span></style></script></table></th></td></tr></frame></area></link></param></hr></input></col></base></meta></basefont></bgsound></embed></spacer></p></dd></dt></caption></colgroup></tbody></tfoot></thead></address></blockquote></center></dir></div></dl></fieldset></listing></menu></ol></ul></li></nobr></wbr></form></button></marquee></object></html></frameset></head></iframe></image></isindex></noembed></noframes></noscript></optgroup></option></plaintext></textarea>
|
<table><tr></strong></b></em></i></u></strike></s></blink></tt></pre></big></small></font></select></h1></h2></h3></h4></h5></h6></body></br></a></img></title></span></style></script></table></th></td></tr></frame></area></link></param></hr></input></col></base></meta></basefont></bgsound></embed></spacer></p></dd></dt></caption></colgroup></tbody></tfoot></thead></address></blockquote></center></dir></div></dl></fieldset></listing></menu></ol></ul></li></nobr></wbr></form></button></marquee></object></html></frameset></head></iframe></image></isindex></noembed></noframes></noscript></optgroup></option></plaintext></textarea>
|
||||||
|
@ -1928,3 +1934,4 @@ Unexpected EOF
|
||||||
| <table>
|
| <table>
|
||||||
| <tbody>
|
| <tbody>
|
||||||
| <tr>
|
| <tr>
|
||||||
|
| <p>
|
||||||
|
|
|
@ -777,3 +777,4 @@ Unexpected </p> end tag.
|
||||||
| <tbody>
|
| <tbody>
|
||||||
| <tr>
|
| <tr>
|
||||||
| <td>
|
| <td>
|
||||||
|
| <p>
|
||||||
|
|
|
@ -61,7 +61,6 @@ No DOCTYPE
|
||||||
|
|
||||||
#data
|
#data
|
||||||
<!DOCTYPE htML><html><head></head><body><pre>
|
<!DOCTYPE htML><html><head></head><body><pre>
|
||||||
|
|
||||||
foo</pre></body></html>
|
foo</pre></body></html>
|
||||||
#errors
|
#errors
|
||||||
#document
|
#document
|
||||||
|
@ -72,10 +71,22 @@ foo</pre></body></html>
|
||||||
| <pre>
|
| <pre>
|
||||||
| "foo"
|
| "foo"
|
||||||
|
|
||||||
|
|
||||||
#data
|
#data
|
||||||
<!DOCTYPE htML><html><head></head><body><pre>
|
<!DOCTYPE htML><html><head></head><body><pre>
|
||||||
|
|
||||||
|
foo</pre></body></html>
|
||||||
|
#errors
|
||||||
|
#document
|
||||||
|
| <!DOCTYPE htML>
|
||||||
|
| <html>
|
||||||
|
| <head>
|
||||||
|
| <body>
|
||||||
|
| <pre>
|
||||||
|
| "
|
||||||
|
foo"
|
||||||
|
|
||||||
|
#data
|
||||||
|
<!DOCTYPE htML><html><head></head><body><pre>
|
||||||
foo
|
foo
|
||||||
</pre></body></html>
|
</pre></body></html>
|
||||||
#errors
|
#errors
|
||||||
|
@ -183,7 +194,6 @@ y</pre></body></html>
|
||||||
|
|
||||||
#data
|
#data
|
||||||
<!DOCTYPE htML><textarea>
|
<!DOCTYPE htML><textarea>
|
||||||
|
|
||||||
foo</textarea>
|
foo</textarea>
|
||||||
#errors
|
#errors
|
||||||
#document
|
#document
|
||||||
|
@ -194,6 +204,20 @@ foo</textarea>
|
||||||
| <textarea>
|
| <textarea>
|
||||||
| "foo"
|
| "foo"
|
||||||
|
|
||||||
|
#data
|
||||||
|
<!DOCTYPE htML><textarea>
|
||||||
|
|
||||||
|
foo</textarea>
|
||||||
|
#errors
|
||||||
|
#document
|
||||||
|
| <!DOCTYPE htML>
|
||||||
|
| <html>
|
||||||
|
| <head>
|
||||||
|
| <body>
|
||||||
|
| <textarea>
|
||||||
|
| "
|
||||||
|
foo"
|
||||||
|
|
||||||
#data
|
#data
|
||||||
<!DOCTYPE htML><html><head></head><body><ul><li><div><p><li></ul></body></html>
|
<!DOCTYPE htML><html><head></head><body><ul><li><div><p><li></ul></body></html>
|
||||||
#errors
|
#errors
|
||||||
|
|
|
@ -1,37 +1,49 @@
|
||||||
#data
|
#data
|
||||||
direct div content
|
direct div content
|
||||||
#errors
|
#errors
|
||||||
#document-fragment div
|
#document-fragment
|
||||||
|
div
|
||||||
|
#document
|
||||||
| "direct div content"
|
| "direct div content"
|
||||||
|
|
||||||
#data
|
#data
|
||||||
direct textarea content
|
direct textarea content
|
||||||
#errors
|
#errors
|
||||||
#document-fragment textarea
|
#document-fragment
|
||||||
|
textarea
|
||||||
|
#document
|
||||||
| "direct textarea content"
|
| "direct textarea content"
|
||||||
|
|
||||||
#data
|
#data
|
||||||
textarea content with <em>pseudo</em> <foo>markup
|
textarea content with <em>pseudo</em> <foo>markup
|
||||||
#errors
|
#errors
|
||||||
#document-fragment textarea
|
#document-fragment
|
||||||
|
textarea
|
||||||
|
#document
|
||||||
| "textarea content with <em>pseudo</em> <foo>markup"
|
| "textarea content with <em>pseudo</em> <foo>markup"
|
||||||
|
|
||||||
#data
|
#data
|
||||||
this is CDATA inside a <style> element
|
this is CDATA inside a <style> element
|
||||||
#errors
|
#errors
|
||||||
#document-fragment style
|
#document-fragment
|
||||||
|
style
|
||||||
|
#document
|
||||||
| "this is CDATA inside a <style> element"
|
| "this is CDATA inside a <style> element"
|
||||||
|
|
||||||
#data
|
#data
|
||||||
</plaintext>
|
</plaintext>
|
||||||
#errors
|
#errors
|
||||||
#document-fragment plaintext
|
#document-fragment
|
||||||
|
plaintext
|
||||||
|
#document
|
||||||
| "</plaintext>"
|
| "</plaintext>"
|
||||||
|
|
||||||
#data
|
#data
|
||||||
setting html's innerHTML
|
setting html's innerHTML
|
||||||
#errors
|
#errors
|
||||||
#document-fragment html
|
#document-fragment
|
||||||
|
html
|
||||||
|
#document
|
||||||
| <head>
|
| <head>
|
||||||
| <body>
|
| <body>
|
||||||
| "setting html's innerHTML"
|
| "setting html's innerHTML"
|
||||||
|
@ -39,6 +51,8 @@ setting html's innerHTML
|
||||||
#data
|
#data
|
||||||
<title>setting head's innerHTML</title>
|
<title>setting head's innerHTML</title>
|
||||||
#errors
|
#errors
|
||||||
#document-fragment head
|
#document-fragment
|
||||||
|
head
|
||||||
|
#document
|
||||||
| <title>
|
| <title>
|
||||||
| "setting head's innerHTML"
|
| "setting head's innerHTML"
|
||||||
|
|
|
@ -27,3 +27,41 @@
|
||||||
| <head>
|
| <head>
|
||||||
| <body>
|
| <body>
|
||||||
| <meta>
|
| <meta>
|
||||||
|
|
||||||
|
#data
|
||||||
|
<!doctype HTml><form><div></form><div>
|
||||||
|
#errors
|
||||||
|
Form end tag ignored.
|
||||||
|
Unexpected end of file.
|
||||||
|
#document
|
||||||
|
| <!DOCTYPE HTml>
|
||||||
|
| <html>
|
||||||
|
| <head>
|
||||||
|
| <body>
|
||||||
|
| <form>
|
||||||
|
| <div>
|
||||||
|
| <div>
|
||||||
|
|
||||||
|
#data
|
||||||
|
<!doctype HTml><title>&</title>
|
||||||
|
#errors
|
||||||
|
Unexpected end of file.
|
||||||
|
#document
|
||||||
|
| <!DOCTYPE HTml>
|
||||||
|
| <html>
|
||||||
|
| <head>
|
||||||
|
| <title>
|
||||||
|
| "&"
|
||||||
|
| <body>
|
||||||
|
|
||||||
|
#data
|
||||||
|
<!doctype HTml><title><!--&--></title>
|
||||||
|
#errors
|
||||||
|
Unexpected end of file.
|
||||||
|
#document
|
||||||
|
| <!DOCTYPE HTml>
|
||||||
|
| <html>
|
||||||
|
| <head>
|
||||||
|
| <title>
|
||||||
|
| "<!--&-->"
|
||||||
|
| <body>
|
||||||
|
|
78
vendor/plugins/HTML5lib/tests/preamble.rb
vendored
78
vendor/plugins/HTML5lib/tests/preamble.rb
vendored
|
@ -1,9 +1,9 @@
|
||||||
require 'test/unit'
|
require 'test/unit'
|
||||||
|
|
||||||
HTML5LIB_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))
|
HTML5_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))
|
||||||
|
|
||||||
if File.exists?(File.join(HTML5LIB_BASE, 'testdata'))
|
if File.exists?(File.join(HTML5_BASE, 'testdata'))
|
||||||
TESTDATA_DIR = File.join(HTML5LIB_BASE, 'testdata')
|
TESTDATA_DIR = File.join(HTML5_BASE, 'testdata')
|
||||||
else
|
else
|
||||||
TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata')
|
TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata')
|
||||||
end
|
end
|
||||||
|
@ -12,7 +12,7 @@ $:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')
|
||||||
|
|
||||||
$:.unshift File.dirname(__FILE__)
|
$:.unshift File.dirname(__FILE__)
|
||||||
|
|
||||||
def html5lib_test_files(subdirectory)
|
def html5_test_files(subdirectory)
|
||||||
Dir[File.join(TESTDATA_DIR, subdirectory, '*.*')]
|
Dir[File.join(TESTDATA_DIR, subdirectory, '*.*')]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -30,42 +30,8 @@ rescue LoadError
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
module HTML5lib
|
module HTML5
|
||||||
module TestSupport
|
module TestSupport
|
||||||
def self.startswith?(a, b)
|
|
||||||
b[0... a.length] == a
|
|
||||||
end
|
|
||||||
|
|
||||||
def self.parseTestcase(data)
|
|
||||||
innerHTML = nil
|
|
||||||
input = []
|
|
||||||
output = []
|
|
||||||
errors = []
|
|
||||||
currentList = input
|
|
||||||
data.split(/\n/).each do |line|
|
|
||||||
if !line.empty? and !startswith?("#errors", line) and
|
|
||||||
!startswith?("#document", line) and
|
|
||||||
!startswith?("#data", line) and
|
|
||||||
!startswith?("#document-fragment", line)
|
|
||||||
|
|
||||||
if currentList == output and startswith?("|", line)
|
|
||||||
currentList.push(line[2..-1])
|
|
||||||
else
|
|
||||||
currentList.push(line)
|
|
||||||
end
|
|
||||||
elsif line == "#errors"
|
|
||||||
currentList = errors
|
|
||||||
elsif line == "#document" or startswith?("#document-fragment", line)
|
|
||||||
if startswith?("#document-fragment", line)
|
|
||||||
innerHTML = line[19..-1]
|
|
||||||
raise AssertionError unless innerHTML
|
|
||||||
end
|
|
||||||
currentList = output
|
|
||||||
end
|
|
||||||
end
|
|
||||||
return innerHTML, input.join("\n"), output.join("\n"), errors
|
|
||||||
end
|
|
||||||
|
|
||||||
# convert the output of str(document) to the format used in the testcases
|
# convert the output of str(document) to the format used in the testcases
|
||||||
def convertTreeDump(treedump)
|
def convertTreeDump(treedump)
|
||||||
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
|
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
|
||||||
|
@ -77,5 +43,39 @@ module HTML5lib
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
class TestData
|
||||||
|
include Enumerable
|
||||||
|
|
||||||
|
def initialize(filename, sections)
|
||||||
|
@f = open(filename)
|
||||||
|
@sections = sections
|
||||||
|
end
|
||||||
|
|
||||||
|
def each
|
||||||
|
data = {}
|
||||||
|
key=nil
|
||||||
|
@f.each_line do |line|
|
||||||
|
if line[0] == ?# and @sections.include?(line[1..-2])
|
||||||
|
heading = line[1..-2]
|
||||||
|
if data.any? and heading == @sections[0]
|
||||||
|
data[key].chomp! #Remove trailing newline
|
||||||
|
yield normaliseOutput(data)
|
||||||
|
data = {}
|
||||||
|
end
|
||||||
|
key = heading
|
||||||
|
data[key]=""
|
||||||
|
elsif key
|
||||||
|
data[key] += line
|
||||||
|
end
|
||||||
|
end
|
||||||
|
yield normaliseOutput(data) if data
|
||||||
|
end
|
||||||
|
|
||||||
|
def normaliseOutput(data)
|
||||||
|
#Remove trailing newlines
|
||||||
|
data.keys.each { |key| data[key].chomp! }
|
||||||
|
@sections.map {|heading| data[heading]}
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
16
vendor/plugins/HTML5lib/tests/test_encoding.rb
vendored
16
vendor/plugins/HTML5lib/tests/test_encoding.rb
vendored
|
@ -1,8 +1,10 @@
|
||||||
require File.join(File.dirname(__FILE__), 'preamble')
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
require 'html5lib/inputstream'
|
require 'html5/inputstream'
|
||||||
|
|
||||||
class Html5EncodingTestCase < Test::Unit::TestCase
|
class Html5EncodingTestCase < Test::Unit::TestCase
|
||||||
|
include HTML5
|
||||||
|
include TestSupport
|
||||||
|
|
||||||
begin
|
begin
|
||||||
require 'rubygems'
|
require 'rubygems'
|
||||||
|
@ -10,23 +12,21 @@ class Html5EncodingTestCase < Test::Unit::TestCase
|
||||||
|
|
||||||
def test_chardet
|
def test_chardet
|
||||||
file = File.open(File.join(TESTDATA_DIR, 'encoding', 'chardet', 'test_big5.txt'), 'r')
|
file = File.open(File.join(TESTDATA_DIR, 'encoding', 'chardet', 'test_big5.txt'), 'r')
|
||||||
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
|
stream = HTML5::HTMLInputStream.new(file, :chardet => true)
|
||||||
assert_equal 'big5', stream.char_encoding.downcase
|
assert_equal 'big5', stream.char_encoding.downcase
|
||||||
rescue LoadError
|
rescue LoadError
|
||||||
puts "chardet not found, skipping chardet tests"
|
puts "chardet not found, skipping chardet tests"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
html5lib_test_files('encoding').each do |test_file|
|
html5_test_files('encoding').each do |test_file|
|
||||||
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
|
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
|
||||||
|
|
||||||
File.read(test_file).split("#data\n").each_with_index do |data, index|
|
TestData.new(test_file, %w(data encoding)).
|
||||||
next if data.empty?
|
each_with_index do |(input, encoding), index|
|
||||||
input, encoding = data.split(/\n#encoding\s+/, 2)
|
|
||||||
encoding = encoding.split[0]
|
|
||||||
|
|
||||||
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
|
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
|
||||||
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
|
stream = HTML5::HTMLInputStream.new(input, :chardet => false)
|
||||||
assert_equal encoding.downcase, stream.char_encoding.downcase, input
|
assert_equal encoding.downcase, stream.char_encoding.downcase, input
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
75
vendor/plugins/HTML5lib/tests/test_lxp.rb
vendored
75
vendor/plugins/HTML5lib/tests/test_lxp.rb
vendored
|
@ -1,23 +1,23 @@
|
||||||
require File.join(File.dirname(__FILE__), 'preamble')
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
require 'html5lib/liberalxmlparser'
|
require 'html5/liberalxmlparser'
|
||||||
|
|
||||||
XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/
|
XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/
|
||||||
SORTATTRS = '<#{$1+$2.split.sort.join(' ')+$3}>'
|
|
||||||
|
|
||||||
def assert_xml_equal(input, expected=nil, parser=HTML5lib::XMLParser)
|
def assert_xml_equal(input, expected=nil, parser=HTML5::XMLParser)
|
||||||
|
sortattrs = proc {"<#{$1+$2.split.sort.join(' ')+$3}>"}
|
||||||
document = parser.parse(input.chomp).root
|
document = parser.parse(input.chomp).root
|
||||||
if not expected
|
if not expected
|
||||||
expected = input.chomp.gsub(XMLELEM,SORTATTRS)
|
expected = input.chomp.gsub(XMLELEM,&sortattrs)
|
||||||
expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')}
|
expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')}
|
||||||
output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,SORTATTRS)
|
output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,&sortattrs)
|
||||||
assert_equal(expected, output)
|
assert_equal(expected, output)
|
||||||
else
|
else
|
||||||
assert_equal(expected, document.to_s.gsub(/'/,'"'))
|
assert_equal(expected, document.to_s.gsub(/'/,'"'))
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def assert_xhtml_equal(input, expected=nil, parser=HTML5lib::XHTMLParser)
|
def assert_xhtml_equal(input, expected=nil, parser=HTML5::XHTMLParser)
|
||||||
assert_xml_equal(input, expected, parser)
|
assert_xml_equal(input, expected, parser)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -34,10 +34,10 @@ class BasicXhtml5Test < Test::Unit::TestCase
|
||||||
|
|
||||||
def test_title_body_named_charref
|
def test_title_body_named_charref
|
||||||
assert_xhtml_equal(
|
assert_xhtml_equal(
|
||||||
'<title>mdash</title>A &mdash B',
|
'<title>ntilde</title>A ñ B',
|
||||||
'<html xmlns="http://www.w3.org/1999/xhtml">' +
|
'<html xmlns="http://www.w3.org/1999/xhtml">' +
|
||||||
'<head><title>mdash</title></head>' +
|
'<head><title>ntilde</title></head>' +
|
||||||
'<body>A '+ [0x2014].pack('U') + ' B</body>' +
|
'<body>A '+ [0xF1].pack('U') + ' B</body>' +
|
||||||
'</html>')
|
'</html>')
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -193,20 +193,71 @@ EOX
|
||||||
def test_br
|
def test_br
|
||||||
assert_xhtml_equal <<EOX1
|
assert_xhtml_equal <<EOX1
|
||||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
<head><title>XLINK</title></head>
|
<head><title>BR</title></head>
|
||||||
<body>
|
<body>
|
||||||
<br/>
|
<br/>
|
||||||
</body></html>
|
</body></html>
|
||||||
EOX1
|
EOX1
|
||||||
end
|
end
|
||||||
|
|
||||||
def xtest_strong
|
def test_strong
|
||||||
assert_xhtml_equal <<EOX
|
assert_xhtml_equal <<EOX
|
||||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
<head><title>XLINK</title></head>
|
<head><title>STRONG</title></head>
|
||||||
<body>
|
<body>
|
||||||
<strong></strong>
|
<strong></strong>
|
||||||
</body></html>
|
</body></html>
|
||||||
EOX
|
EOX
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def test_script
|
||||||
|
assert_xhtml_equal <<EOX
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head><title>SCRIPT</title></head>
|
||||||
|
<body>
|
||||||
|
<script>1 < 2 & 3</script>
|
||||||
|
</body></html>
|
||||||
|
EOX
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_script_src
|
||||||
|
assert_xhtml_equal <<EOX1, <<EOX2.strip
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head><title>SCRIPT</title><script src="http://example.com"/></head>
|
||||||
|
<body>
|
||||||
|
<script>1 < 2 & 3</script>
|
||||||
|
</body></html>
|
||||||
|
EOX1
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head><title>SCRIPT</title><script src="http://example.com"></script></head>
|
||||||
|
<body>
|
||||||
|
<script>1 < 2 & 3</script>
|
||||||
|
</body></html>
|
||||||
|
EOX2
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_title
|
||||||
|
assert_xhtml_equal <<EOX
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head><title>1 < 2 & 3</title></head>
|
||||||
|
<body>
|
||||||
|
</body></html>
|
||||||
|
EOX
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_prolog
|
||||||
|
assert_xhtml_equal <<EOX1, <<EOX2.strip
|
||||||
|
<?xml version="1.0" encoding="UTF-8" ?>
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head><title>PROLOG</title></head>
|
||||||
|
<body>
|
||||||
|
</body></html>
|
||||||
|
EOX1
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head><title>PROLOG</title></head>
|
||||||
|
<body>
|
||||||
|
</body></html>
|
||||||
|
EOX2
|
||||||
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
23
vendor/plugins/HTML5lib/tests/test_parser.rb
vendored
23
vendor/plugins/HTML5lib/tests/test_parser.rb
vendored
|
@ -1,7 +1,7 @@
|
||||||
require File.join(File.dirname(__FILE__), 'preamble')
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
require 'html5lib/treebuilders'
|
require 'html5/treebuilders'
|
||||||
require 'html5lib/html5parser'
|
require 'html5/html5parser'
|
||||||
|
|
||||||
|
|
||||||
$tree_types_to_test = ['simpletree', 'rexml']
|
$tree_types_to_test = ['simpletree', 'rexml']
|
||||||
|
@ -18,18 +18,17 @@ puts 'Testing tree builders: ' + $tree_types_to_test * ', '
|
||||||
|
|
||||||
|
|
||||||
class Html5ParserTestCase < Test::Unit::TestCase
|
class Html5ParserTestCase < Test::Unit::TestCase
|
||||||
include HTML5lib
|
include HTML5
|
||||||
include TestSupport
|
include TestSupport
|
||||||
|
|
||||||
html5lib_test_files('tree-construction').each do |test_file|
|
html5_test_files('tree-construction').each do |test_file|
|
||||||
|
|
||||||
test_name = File.basename(test_file).sub('.dat', '')
|
test_name = File.basename(test_file).sub('.dat', '')
|
||||||
|
|
||||||
File.read(test_file).split("#data\n").each_with_index do |data, index|
|
TestData.new(test_file, %w(data errors document-fragment document)).
|
||||||
next if data.empty?
|
each_with_index do |(input, errors, innerHTML, expected), index|
|
||||||
|
|
||||||
innerHTML, input, expected_output, expected_errors =
|
expected = expected.gsub("\n| ","\n")[2..-1]
|
||||||
TestSupport.parseTestcase(data)
|
|
||||||
|
|
||||||
$tree_types_to_test.each do |tree_name|
|
$tree_types_to_test.each do |tree_name|
|
||||||
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
|
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
|
||||||
|
@ -44,9 +43,9 @@ class Html5ParserTestCase < Test::Unit::TestCase
|
||||||
|
|
||||||
actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
|
actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
|
||||||
|
|
||||||
assert_equal sortattrs(expected_output), sortattrs(actual_output), [
|
assert_equal sortattrs(expected), sortattrs(actual_output), [
|
||||||
'', 'Input:', input,
|
'', 'Input:', input,
|
||||||
'', 'Expected:', expected_output,
|
'', 'Expected:', expected,
|
||||||
'', 'Recieved:', actual_output
|
'', 'Recieved:', actual_output
|
||||||
].join("\n")
|
].join("\n")
|
||||||
|
|
||||||
|
@ -54,9 +53,9 @@ class Html5ParserTestCase < Test::Unit::TestCase
|
||||||
actual_errors = parser.errors.map do |(line, col), message|
|
actual_errors = parser.errors.map do |(line, col), message|
|
||||||
'Line: %i Col: %i %s' % [line, col, message]
|
'Line: %i Col: %i %s' % [line, col, message]
|
||||||
end
|
end
|
||||||
assert_equal expected_errors.length, parser.errors.length, [
|
assert_equal errors.length, parser.errors.length, [
|
||||||
'Input', input + "\n",
|
'Input', input + "\n",
|
||||||
'Expected errors:', expected_errors.join("\n"),
|
'Expected errors:', errors.join("\n"),
|
||||||
'Actual errors:', actual_errors.join("\n")
|
'Actual errors:', actual_errors.join("\n")
|
||||||
].join("\n")
|
].join("\n")
|
||||||
end
|
end
|
||||||
|
|
14
vendor/plugins/HTML5lib/tests/test_sanitizer.rb
vendored
14
vendor/plugins/HTML5lib/tests/test_sanitizer.rb
vendored
|
@ -2,14 +2,14 @@
|
||||||
|
|
||||||
require File.join(File.dirname(__FILE__), 'preamble')
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
require 'html5lib/html5parser'
|
require 'html5/html5parser'
|
||||||
require 'html5lib/liberalxmlparser'
|
require 'html5/liberalxmlparser'
|
||||||
require 'html5lib/treewalkers'
|
require 'html5/treewalkers'
|
||||||
require 'html5lib/serializer'
|
require 'html5/serializer'
|
||||||
require 'html5lib/sanitizer'
|
require 'html5/sanitizer'
|
||||||
|
|
||||||
class SanitizeTest < Test::Unit::TestCase
|
class SanitizeTest < Test::Unit::TestCase
|
||||||
include HTML5lib
|
include HTML5
|
||||||
|
|
||||||
def sanitize_xhtml stream
|
def sanitize_xhtml stream
|
||||||
XHTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).to_s
|
XHTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).to_s
|
||||||
|
@ -131,7 +131,7 @@ class SanitizeTest < Test::Unit::TestCase
|
||||||
# check_sanitization(input, output, output, output)
|
# check_sanitization(input, output, output, output)
|
||||||
# end
|
# end
|
||||||
|
|
||||||
html5lib_test_files('sanitizer').each do |filename|
|
html5_test_files('sanitizer').each do |filename|
|
||||||
JSON::parse(open(filename).read).each do |test|
|
JSON::parse(open(filename).read).each do |test|
|
||||||
define_method "test_#{test['name']}" do
|
define_method "test_#{test['name']}" do
|
||||||
check_sanitization(
|
check_sanitization(
|
||||||
|
|
14
vendor/plugins/HTML5lib/tests/test_serializer.rb
vendored
14
vendor/plugins/HTML5lib/tests/test_serializer.rb
vendored
|
@ -1,13 +1,13 @@
|
||||||
require File.join(File.dirname(__FILE__), 'preamble')
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
require 'html5lib/html5parser'
|
require 'html5/html5parser'
|
||||||
require 'html5lib/serializer'
|
require 'html5/serializer'
|
||||||
require 'html5lib/treewalkers'
|
require 'html5/treewalkers'
|
||||||
|
|
||||||
#Run the serialize error checks
|
#Run the serialize error checks
|
||||||
checkSerializeErrors = false
|
checkSerializeErrors = false
|
||||||
|
|
||||||
class JsonWalker < HTML5lib::TreeWalkers::Base
|
class JsonWalker < HTML5::TreeWalkers::Base
|
||||||
def each
|
def each
|
||||||
@tree.each do |token|
|
@tree.each do |token|
|
||||||
case token[0]
|
case token[0]
|
||||||
|
@ -31,7 +31,7 @@ class JsonWalker < HTML5lib::TreeWalkers::Base
|
||||||
end
|
end
|
||||||
|
|
||||||
class Html5SerializeTestcase < Test::Unit::TestCase
|
class Html5SerializeTestcase < Test::Unit::TestCase
|
||||||
html5lib_test_files('serializer').each do |filename|
|
html5_test_files('serializer').each do |filename|
|
||||||
test_name = File.basename(filename).sub('.test', '')
|
test_name = File.basename(filename).sub('.test', '')
|
||||||
tests = JSON::parse(open(filename).read)
|
tests = JSON::parse(open(filename).read)
|
||||||
tests['tests'].each_with_index do |test, index|
|
tests['tests'].each_with_index do |test, index|
|
||||||
|
@ -41,7 +41,7 @@ class Html5SerializeTestcase < Test::Unit::TestCase
|
||||||
test["options"][:encoding] = test["options"]["encoding"]
|
test["options"][:encoding] = test["options"]["encoding"]
|
||||||
end
|
end
|
||||||
|
|
||||||
result = HTML5lib::HTMLSerializer.
|
result = HTML5::HTMLSerializer.
|
||||||
serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
|
serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
|
||||||
expected = test["expected"]
|
expected = test["expected"]
|
||||||
if expected.length == 1
|
if expected.length == 1
|
||||||
|
@ -52,7 +52,7 @@ class Html5SerializeTestcase < Test::Unit::TestCase
|
||||||
|
|
||||||
return if test_name == 'optionaltags'
|
return if test_name == 'optionaltags'
|
||||||
|
|
||||||
result = HTML5lib::XHTMLSerializer.
|
result = HTML5::XHTMLSerializer.
|
||||||
serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
|
serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
|
||||||
expected = test["xhtml"] || test["expected"]
|
expected = test["xhtml"] || test["expected"]
|
||||||
if expected.length == 1
|
if expected.length == 1
|
||||||
|
|
4
vendor/plugins/HTML5lib/tests/test_stream.rb
vendored
4
vendor/plugins/HTML5lib/tests/test_stream.rb
vendored
|
@ -1,9 +1,9 @@
|
||||||
require File.join(File.dirname(__FILE__), 'preamble')
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
require 'html5lib/inputstream'
|
require 'html5/inputstream'
|
||||||
|
|
||||||
class HTMLInputStreamTest < Test::Unit::TestCase
|
class HTMLInputStreamTest < Test::Unit::TestCase
|
||||||
include HTML5lib
|
include HTML5
|
||||||
|
|
||||||
def test_char_ascii
|
def test_char_ascii
|
||||||
stream = HTMLInputStream.new("'", :encoding=>'ascii')
|
stream = HTMLInputStream.new("'", :encoding=>'ascii')
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
require File.join(File.dirname(__FILE__), 'preamble')
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
require 'html5lib/tokenizer'
|
require 'html5/tokenizer'
|
||||||
|
|
||||||
require 'tokenizer_test_parser'
|
require 'tokenizer_test_parser'
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ class Html5TokenizerTestCase < Test::Unit::TestCase
|
||||||
'' ] * "\n"
|
'' ] * "\n"
|
||||||
|
|
||||||
assert_nothing_raised message do
|
assert_nothing_raised message do
|
||||||
tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])
|
tokenizer = HTML5::HTMLTokenizer.new(data['input'])
|
||||||
|
|
||||||
tokenizer.contentModelFlag = content_model_flag.to_sym
|
tokenizer.contentModelFlag = content_model_flag.to_sym
|
||||||
|
|
||||||
|
@ -53,7 +53,7 @@ class Html5TokenizerTestCase < Test::Unit::TestCase
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
html5lib_test_files('tokenizer').each do |test_file|
|
html5_test_files('tokenizer').each do |test_file|
|
||||||
test_name = File.basename(test_file).sub('.test', '')
|
test_name = File.basename(test_file).sub('.test', '')
|
||||||
|
|
||||||
tests = JSON.parse(File.read(test_file))['tests']
|
tests = JSON.parse(File.read(test_file))['tests']
|
||||||
|
|
|
@ -1,25 +1,25 @@
|
||||||
require File.join(File.dirname(__FILE__), 'preamble')
|
require File.join(File.dirname(__FILE__), 'preamble')
|
||||||
|
|
||||||
require 'html5lib/html5parser'
|
require 'html5/html5parser'
|
||||||
require 'html5lib/treewalkers'
|
require 'html5/treewalkers'
|
||||||
require 'html5lib/treebuilders'
|
require 'html5/treebuilders'
|
||||||
|
|
||||||
$tree_types_to_test = {
|
$tree_types_to_test = {
|
||||||
'simpletree' =>
|
'simpletree' =>
|
||||||
{:builder => HTML5lib::TreeBuilders['simpletree'],
|
{:builder => HTML5::TreeBuilders['simpletree'],
|
||||||
:walker => HTML5lib::TreeWalkers['simpletree']},
|
:walker => HTML5::TreeWalkers['simpletree']},
|
||||||
'rexml' =>
|
'rexml' =>
|
||||||
{:builder => HTML5lib::TreeBuilders['rexml'],
|
{:builder => HTML5::TreeBuilders['rexml'],
|
||||||
:walker => HTML5lib::TreeWalkers['rexml']},
|
:walker => HTML5::TreeWalkers['rexml']},
|
||||||
'hpricot' =>
|
'hpricot' =>
|
||||||
{:builder => HTML5lib::TreeBuilders['hpricot'],
|
{:builder => HTML5::TreeBuilders['hpricot'],
|
||||||
:walker => HTML5lib::TreeWalkers['hpricot']},
|
:walker => HTML5::TreeWalkers['hpricot']},
|
||||||
}
|
}
|
||||||
|
|
||||||
puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', '
|
puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', '
|
||||||
|
|
||||||
class TestTreeWalkers < Test::Unit::TestCase
|
class TestTreeWalkers < Test::Unit::TestCase
|
||||||
include HTML5lib::TestSupport
|
include HTML5::TestSupport
|
||||||
|
|
||||||
def concatenateCharacterTokens(tokens)
|
def concatenateCharacterTokens(tokens)
|
||||||
charactersToken = nil
|
charactersToken = nil
|
||||||
|
@ -70,22 +70,21 @@ class TestTreeWalkers < Test::Unit::TestCase
|
||||||
return output.join("\n")
|
return output.join("\n")
|
||||||
end
|
end
|
||||||
|
|
||||||
html5lib_test_files('tree-construction').each do |test_file|
|
html5_test_files('tree-construction').each do |test_file|
|
||||||
|
|
||||||
test_name = File.basename(test_file).sub('.dat', '')
|
test_name = File.basename(test_file).sub('.dat', '')
|
||||||
next if test_name == 'tests5' # TODO
|
next if test_name == 'tests5' # TODO
|
||||||
|
|
||||||
File.read(test_file).split("#data\n").each_with_index do |data, index|
|
TestData.new(test_file, %w(data errors document-fragment document)).
|
||||||
next if data.empty?
|
each_with_index do |(input, errors, innerHTML, expected), index|
|
||||||
|
|
||||||
innerHTML, input, expected_output, expected_errors =
|
expected = expected.gsub("\n| ","\n")[2..-1]
|
||||||
HTML5lib::TestSupport::parseTestcase(data)
|
|
||||||
|
|
||||||
$tree_types_to_test.each do |tree_name, tree_class|
|
$tree_types_to_test.each do |tree_name, tree_class|
|
||||||
|
|
||||||
define_method "test_#{test_name}_#{index}_#{tree_name}" do
|
define_method "test_#{test_name}_#{index}_#{tree_name}" do
|
||||||
|
|
||||||
parser = HTML5lib::HTMLParser.new(:tree => tree_class[:builder])
|
parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])
|
||||||
|
|
||||||
if innerHTML
|
if innerHTML
|
||||||
parser.parseFragment(input, innerHTML)
|
parser.parseFragment(input, innerHTML)
|
||||||
|
@ -97,7 +96,7 @@ class TestTreeWalkers < Test::Unit::TestCase
|
||||||
|
|
||||||
begin
|
begin
|
||||||
output = sortattrs(convertTokens(tree_class[:walker].new(document)))
|
output = sortattrs(convertTokens(tree_class[:walker].new(document)))
|
||||||
expected = sortattrs(expected_output)
|
expected = sortattrs(expected)
|
||||||
assert_equal expected, output, [
|
assert_equal expected, output, [
|
||||||
'', 'Input:', input,
|
'', 'Input:', input,
|
||||||
'', 'Expected:', expected,
|
'', 'Expected:', expected,
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
require 'html5lib/constants'
|
require 'html5/constants'
|
||||||
|
|
||||||
class TokenizerTestParser
|
class TokenizerTestParser
|
||||||
def initialize(tokenizer)
|
def initialize(tokenizer)
|
||||||
|
@ -27,7 +27,7 @@ class TokenizerTestParser
|
||||||
end
|
end
|
||||||
|
|
||||||
def processEmptyTag(token)
|
def processEmptyTag(token)
|
||||||
if not HTML5lib::VOID_ELEMENTS.include? token[:name]
|
if not HTML5::VOID_ELEMENTS.include? token[:name]
|
||||||
@outputTokens.push("ParseError")
|
@outputTokens.push("ParseError")
|
||||||
end
|
end
|
||||||
@outputTokens.push(["StartTag", token[:name], token[:data]])
|
@outputTokens.push(["StartTag", token[:name], token[:data]])
|
||||||
|
|
|
@ -28,6 +28,7 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
|
||||||
|
|
||||||
class LineSource
|
class LineSource
|
||||||
include MaRuKu::Strings
|
include MaRuKu::Strings
|
||||||
|
attr_reader :parent
|
||||||
|
|
||||||
def initialize(lines, parent=nil, parent_offset=nil)
|
def initialize(lines, parent=nil, parent_offset=nil)
|
||||||
raise "NIL lines? " if not lines
|
raise "NIL lines? " if not lines
|
||||||
|
|
|
@ -65,22 +65,8 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
|
||||||
when :ald
|
when :ald
|
||||||
output.push read_ald(src)
|
output.push read_ald(src)
|
||||||
when :text
|
when :text
|
||||||
if src.cur_line =~ MightBeTableHeader and
|
# paragraph, or table, or definition list
|
||||||
(src.next_line && src.next_line =~ TableSeparator)
|
read_text_material(src, output)
|
||||||
output.push read_table(src)
|
|
||||||
elsif [:header1,:header2].include? src.next_line.md_type
|
|
||||||
output.push read_header12(src)
|
|
||||||
elsif eventually_comes_a_def_list(src)
|
|
||||||
definition = read_definition(src)
|
|
||||||
if output.last.kind_of?(MDElement) &&
|
|
||||||
output.last.node_type == :definition_list then
|
|
||||||
output.last.children << definition
|
|
||||||
else
|
|
||||||
output.push md_el(:definition_list, [definition])
|
|
||||||
end
|
|
||||||
else # Start of a paragraph
|
|
||||||
output.push read_paragraph(src)
|
|
||||||
end
|
|
||||||
when :header2, :hrule
|
when :header2, :hrule
|
||||||
# hrule
|
# hrule
|
||||||
src.shift_line
|
src.shift_line
|
||||||
|
@ -102,7 +88,12 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
|
||||||
when :raw_html; e = read_raw_html(src); output << e if e
|
when :raw_html; e = read_raw_html(src); output << e if e
|
||||||
|
|
||||||
when :footnote_text; output.push read_footnote_text(src)
|
when :footnote_text; output.push read_footnote_text(src)
|
||||||
when :ref_definition; read_ref_definition(src, output)
|
when :ref_definition;
|
||||||
|
if src.parent && (src.cur_index == 0)
|
||||||
|
read_text_material(src, output)
|
||||||
|
else
|
||||||
|
read_ref_definition(src, output)
|
||||||
|
end
|
||||||
when :abbreviation; output.push read_abbreviation(src)
|
when :abbreviation; output.push read_abbreviation(src)
|
||||||
when :xml_instr; read_xml_instruction(src, output)
|
when :xml_instr; read_xml_instruction(src, output)
|
||||||
when :metadata;
|
when :metadata;
|
||||||
|
@ -149,6 +140,24 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
|
||||||
output
|
output
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def read_text_material(src, output)
|
||||||
|
if src.cur_line =~ MightBeTableHeader and
|
||||||
|
(src.next_line && src.next_line =~ TableSeparator)
|
||||||
|
output.push read_table(src)
|
||||||
|
elsif [:header1,:header2].include? src.next_line.md_type
|
||||||
|
output.push read_header12(src)
|
||||||
|
elsif eventually_comes_a_def_list(src)
|
||||||
|
definition = read_definition(src)
|
||||||
|
if output.last.kind_of?(MDElement) &&
|
||||||
|
output.last.node_type == :definition_list then
|
||||||
|
output.last.children << definition
|
||||||
|
else
|
||||||
|
output.push md_el(:definition_list, [definition])
|
||||||
|
end
|
||||||
|
else # Start of a paragraph
|
||||||
|
output.push read_paragraph(src)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
def read_ald(src)
|
def read_ald(src)
|
||||||
|
@ -274,9 +283,9 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
|
||||||
item_type = src.cur_line.md_type
|
item_type = src.cur_line.md_type
|
||||||
first = src.shift_line
|
first = src.shift_line
|
||||||
|
|
||||||
# Ugly things going on inside `read_indented_content`
|
|
||||||
indentation = spaces_before_first_char(first)
|
indentation = spaces_before_first_char(first)
|
||||||
break_list = [:ulist, :olist, :ial]
|
break_list = [:ulist, :olist, :ial]
|
||||||
|
# Ugly things going on inside `read_indented_content`
|
||||||
lines, want_my_paragraph =
|
lines, want_my_paragraph =
|
||||||
read_indented_content(src,indentation, break_list, item_type)
|
read_indented_content(src,indentation, break_list, item_type)
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue