Sync with latest HTML5lib and latest Maruku
This commit is contained in:
parent
8e92e4a3ab
commit
8ccaad85a5
|
@ -25,14 +25,14 @@
|
|||
|
||||
module Sanitize
|
||||
|
||||
require 'html5lib/html5parser'
|
||||
require 'html5lib/liberalxmlparser'
|
||||
require 'html5lib/treewalkers'
|
||||
require 'html5lib/treebuilders'
|
||||
require 'html5lib/serializer'
|
||||
require 'html5lib/sanitizer'
|
||||
require 'html5/html5parser'
|
||||
require 'html5/liberalxmlparser'
|
||||
require 'html5/treewalkers'
|
||||
require 'html5/treebuilders'
|
||||
require 'html5/serializer'
|
||||
require 'html5/sanitizer'
|
||||
|
||||
include HTML5lib
|
||||
include HTML5
|
||||
|
||||
# Sanitize a string, parsed using XHTML parsing rules.
|
||||
#
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
require 'html5lib/html5parser'
|
||||
|
||||
module HTML5lib
|
||||
def self.parse(stream, options={})
|
||||
HTMLParser.parse(stream, options)
|
||||
end
|
||||
|
||||
def self.parseFragment(stream, options={})
|
||||
HTMLParser.parse(stream, options)
|
||||
end
|
||||
end
|
||||
require 'html5/html5parser'
|
||||
|
||||
module HTML5
  # Parse a complete HTML document.
  #
  # stream  - the markup to parse; passed to HTMLParser.parse unchanged.
  # options - options hash passed to HTMLParser.parse unchanged
  #           (HTMLParser.parse removes :encoding from it and forwards the
  #           remainder to HTMLParser.new).
  #
  # Returns the result of HTMLParser.parse.
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  # Parse an HTML fragment (innerHTML-style input).
  #
  # stream  - the markup to parse; passed to HTMLParser.parseFragment
  #           unchanged.
  # options - options hash passed to HTMLParser.parseFragment unchanged
  #           (it removes :container, defaulting to 'div', and :encoding,
  #           and forwards the remainder to HTMLParser.new).
  #
  # Returns the result of HTMLParser.parseFragment.
  def self.parseFragment(stream, options={})
    # Must go through the fragment entry point: HTMLParser.parse would treat
    # the input as a whole document and never consume :container.
    HTMLParser.parseFragment(stream, options)
  end
end
|
817
vendor/plugins/HTML5lib/lib/html5/constants.rb
vendored
Executable file
817
vendor/plugins/HTML5lib/lib/html5/constants.rb
vendored
Executable file
|
@ -0,0 +1,817 @@
|
|||
module HTML5
|
||||
|
||||
class EOF < Exception; end
|
||||
|
||||
CONTENT_MODEL_FLAGS = [
|
||||
:PCDATA,
|
||||
:RCDATA,
|
||||
:CDATA,
|
||||
:PLAINTEXT
|
||||
]
|
||||
|
||||
SCOPING_ELEMENTS = %w[
|
||||
button
|
||||
caption
|
||||
html
|
||||
marquee
|
||||
object
|
||||
table
|
||||
td
|
||||
th
|
||||
]
|
||||
|
||||
FORMATTING_ELEMENTS = %w[
|
||||
a
|
||||
b
|
||||
big
|
||||
em
|
||||
font
|
||||
i
|
||||
nobr
|
||||
s
|
||||
small
|
||||
strike
|
||||
strong
|
||||
tt
|
||||
u
|
||||
]
|
||||
|
||||
SPECIAL_ELEMENTS = %w[
|
||||
address
|
||||
area
|
||||
base
|
||||
basefont
|
||||
bgsound
|
||||
blockquote
|
||||
body
|
||||
br
|
||||
center
|
||||
col
|
||||
colgroup
|
||||
dd
|
||||
dir
|
||||
div
|
||||
dl
|
||||
dt
|
||||
embed
|
||||
fieldset
|
||||
form
|
||||
frame
|
||||
frameset
|
||||
h1
|
||||
h2
|
||||
h3
|
||||
h4
|
||||
h5
|
||||
h6
|
||||
head
|
||||
hr
|
||||
iframe
|
||||
image
|
||||
img
|
||||
input
|
||||
isindex
|
||||
li
|
||||
link
|
||||
listing
|
||||
menu
|
||||
meta
|
||||
noembed
|
||||
noframes
|
||||
noscript
|
||||
ol
|
||||
optgroup
|
||||
option
|
||||
p
|
||||
param
|
||||
plaintext
|
||||
pre
|
||||
script
|
||||
select
|
||||
spacer
|
||||
style
|
||||
tbody
|
||||
textarea
|
||||
tfoot
|
||||
thead
|
||||
title
|
||||
tr
|
||||
ul
|
||||
wbr
|
||||
]
|
||||
|
||||
SPACE_CHARACTERS = %W[
|
||||
\t
|
||||
\n
|
||||
\x0B
|
||||
\x0C
|
||||
\x20
|
||||
\r
|
||||
]
|
||||
|
||||
TABLE_INSERT_MODE_ELEMENTS = %w[
|
||||
table
|
||||
tbody
|
||||
tfoot
|
||||
thead
|
||||
tr
|
||||
]
|
||||
|
||||
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
|
||||
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
|
||||
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
|
||||
DIGITS = '0'..'9'
|
||||
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
|
||||
|
||||
# Heading elements need to be ordered
|
||||
HEADING_ELEMENTS = %w[
|
||||
h1
|
||||
h2
|
||||
h3
|
||||
h4
|
||||
h5
|
||||
h6
|
||||
]
|
||||
|
||||
# XXX What about event-source and command?
|
||||
VOID_ELEMENTS = %w[
|
||||
base
|
||||
link
|
||||
meta
|
||||
hr
|
||||
br
|
||||
img
|
||||
embed
|
||||
param
|
||||
area
|
||||
col
|
||||
input
|
||||
]
|
||||
|
||||
CDATA_ELEMENTS = %w[title textarea]
|
||||
|
||||
RCDATA_ELEMENTS = %w[
|
||||
style
|
||||
script
|
||||
xmp
|
||||
iframe
|
||||
noembed
|
||||
noframes
|
||||
noscript
|
||||
]
|
||||
|
||||
BOOLEAN_ATTRIBUTES = {
|
||||
:global => %w[irrelevant],
|
||||
'style' => %w[scoped],
|
||||
'img' => %w[ismap],
|
||||
'audio' => %w[autoplay controls],
|
||||
'video' => %w[autoplay controls],
|
||||
'script' => %w[defer async],
|
||||
'details' => %w[open],
|
||||
'datagrid' => %w[multiple disabled],
|
||||
'command' => %w[hidden disabled checked default],
|
||||
'menu' => %w[autosubmit],
|
||||
'fieldset' => %w[disabled readonly],
|
||||
'option' => %w[disabled readonly selected],
|
||||
'optgroup' => %w[disabled readonly],
|
||||
'button' => %w[disabled autofocus],
|
||||
'input' => %w[disabled readonly required autofocus checked ismap],
|
||||
'select' => %w[disabled readonly autofocus multiple],
|
||||
'output' => %w[disabled readonly]
|
||||
}
|
||||
|
||||
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
|
||||
ENTITIES_WINDOWS1252 = [
|
||||
8364, # 0x80 0x20AC EURO SIGN
|
||||
65533, # 0x81 UNDEFINED
|
||||
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
|
||||
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
|
||||
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
|
||||
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
|
||||
8224, # 0x86 0x2020 DAGGER
|
||||
8225, # 0x87 0x2021 DOUBLE DAGGER
|
||||
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||
8240, # 0x89 0x2030 PER MILLE SIGN
|
||||
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
|
||||
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
|
||||
65533, # 0x8D UNDEFINED
|
||||
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
|
||||
65533, # 0x8F UNDEFINED
|
||||
65533, # 0x90 UNDEFINED
|
||||
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
|
||||
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
|
||||
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
|
||||
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
|
||||
8226, # 0x95 0x2022 BULLET
|
||||
8211, # 0x96 0x2013 EN DASH
|
||||
8212, # 0x97 0x2014 EM DASH
|
||||
732, # 0x98 0x02DC SMALL TILDE
|
||||
8482, # 0x99 0x2122 TRADE MARK SIGN
|
||||
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
|
||||
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
|
||||
65533, # 0x9D UNDEFINED
|
||||
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
|
||||
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||
]
|
||||
|
||||
# ENTITIES was generated from Python using the following code:
|
||||
#
|
||||
# import constants
|
||||
# entities = constants.entities.items()
|
||||
# entities.sort()
|
||||
# list = [ ' '.join([repr(entity), '=>', ord(value)<128 and
|
||||
# repr(str(value)) or repr(value.encode('utf-8')).replace("'",'"')])
|
||||
# for entity, value in entities]
|
||||
# print ' ENTITIES = {\n ' + ',\n '.join(list) + '\n }'
|
||||
|
||||
ENTITIES = {
|
||||
'AElig' => "\xc3\x86",
|
||||
'AElig;' => "\xc3\x86",
|
||||
'AMP' => '&',
|
||||
'AMP;' => '&',
|
||||
'Aacute' => "\xc3\x81",
|
||||
'Aacute;' => "\xc3\x81",
|
||||
'Acirc' => "\xc3\x82",
|
||||
'Acirc;' => "\xc3\x82",
|
||||
'Agrave' => "\xc3\x80",
|
||||
'Agrave;' => "\xc3\x80",
|
||||
'Alpha;' => "\xce\x91",
|
||||
'Aring' => "\xc3\x85",
|
||||
'Aring;' => "\xc3\x85",
|
||||
'Atilde' => "\xc3\x83",
|
||||
'Atilde;' => "\xc3\x83",
|
||||
'Auml' => "\xc3\x84",
|
||||
'Auml;' => "\xc3\x84",
|
||||
'Beta;' => "\xce\x92",
|
||||
'COPY' => "\xc2\xa9",
|
||||
'COPY;' => "\xc2\xa9",
|
||||
'Ccedil' => "\xc3\x87",
|
||||
'Ccedil;' => "\xc3\x87",
|
||||
'Chi;' => "\xce\xa7",
|
||||
'Dagger;' => "\xe2\x80\xa1",
|
||||
'Delta;' => "\xce\x94",
|
||||
'ETH' => "\xc3\x90",
|
||||
'ETH;' => "\xc3\x90",
|
||||
'Eacute' => "\xc3\x89",
|
||||
'Eacute;' => "\xc3\x89",
|
||||
'Ecirc' => "\xc3\x8a",
|
||||
'Ecirc;' => "\xc3\x8a",
|
||||
'Egrave' => "\xc3\x88",
|
||||
'Egrave;' => "\xc3\x88",
|
||||
'Epsilon;' => "\xce\x95",
|
||||
'Eta;' => "\xce\x97",
|
||||
'Euml' => "\xc3\x8b",
|
||||
'Euml;' => "\xc3\x8b",
|
||||
'GT' => '>',
|
||||
'GT;' => '>',
|
||||
'Gamma;' => "\xce\x93",
|
||||
'Iacute' => "\xc3\x8d",
|
||||
'Iacute;' => "\xc3\x8d",
|
||||
'Icirc' => "\xc3\x8e",
|
||||
'Icirc;' => "\xc3\x8e",
|
||||
'Igrave' => "\xc3\x8c",
|
||||
'Igrave;' => "\xc3\x8c",
|
||||
'Iota;' => "\xce\x99",
|
||||
'Iuml' => "\xc3\x8f",
|
||||
'Iuml;' => "\xc3\x8f",
|
||||
'Kappa;' => "\xce\x9a",
|
||||
'LT' => '<',
|
||||
'LT;' => '<',
|
||||
'Lambda;' => "\xce\x9b",
|
||||
'Mu;' => "\xce\x9c",
|
||||
'Ntilde' => "\xc3\x91",
|
||||
'Ntilde;' => "\xc3\x91",
|
||||
'Nu;' => "\xce\x9d",
|
||||
'OElig;' => "\xc5\x92",
|
||||
'Oacute' => "\xc3\x93",
|
||||
'Oacute;' => "\xc3\x93",
|
||||
'Ocirc' => "\xc3\x94",
|
||||
'Ocirc;' => "\xc3\x94",
|
||||
'Ograve' => "\xc3\x92",
|
||||
'Ograve;' => "\xc3\x92",
|
||||
'Omega;' => "\xce\xa9",
|
||||
'Omicron;' => "\xce\x9f",
|
||||
'Oslash' => "\xc3\x98",
|
||||
'Oslash;' => "\xc3\x98",
|
||||
'Otilde' => "\xc3\x95",
|
||||
'Otilde;' => "\xc3\x95",
|
||||
'Ouml' => "\xc3\x96",
|
||||
'Ouml;' => "\xc3\x96",
|
||||
'Phi;' => "\xce\xa6",
|
||||
'Pi;' => "\xce\xa0",
|
||||
'Prime;' => "\xe2\x80\xb3",
|
||||
'Psi;' => "\xce\xa8",
|
||||
'QUOT' => '"',
|
||||
'QUOT;' => '"',
|
||||
'REG' => "\xc2\xae",
|
||||
'REG;' => "\xc2\xae",
|
||||
'Rho;' => "\xce\xa1",
|
||||
'Scaron;' => "\xc5\xa0",
|
||||
'Sigma;' => "\xce\xa3",
|
||||
'THORN' => "\xc3\x9e",
|
||||
'THORN;' => "\xc3\x9e",
|
||||
'TRADE;' => "\xe2\x84\xa2",
|
||||
'Tau;' => "\xce\xa4",
|
||||
'Theta;' => "\xce\x98",
|
||||
'Uacute' => "\xc3\x9a",
|
||||
'Uacute;' => "\xc3\x9a",
|
||||
'Ucirc' => "\xc3\x9b",
|
||||
'Ucirc;' => "\xc3\x9b",
|
||||
'Ugrave' => "\xc3\x99",
|
||||
'Ugrave;' => "\xc3\x99",
|
||||
'Upsilon;' => "\xce\xa5",
|
||||
'Uuml' => "\xc3\x9c",
|
||||
'Uuml;' => "\xc3\x9c",
|
||||
'Xi;' => "\xce\x9e",
|
||||
'Yacute' => "\xc3\x9d",
|
||||
'Yacute;' => "\xc3\x9d",
|
||||
'Yuml;' => "\xc5\xb8",
|
||||
'Zeta;' => "\xce\x96",
|
||||
'aacute' => "\xc3\xa1",
|
||||
'aacute;' => "\xc3\xa1",
|
||||
'acirc' => "\xc3\xa2",
|
||||
'acirc;' => "\xc3\xa2",
|
||||
'acute' => "\xc2\xb4",
|
||||
'acute;' => "\xc2\xb4",
|
||||
'aelig' => "\xc3\xa6",
|
||||
'aelig;' => "\xc3\xa6",
|
||||
'agrave' => "\xc3\xa0",
|
||||
'agrave;' => "\xc3\xa0",
|
||||
'alefsym;' => "\xe2\x84\xb5",
|
||||
'alpha;' => "\xce\xb1",
|
||||
'amp' => '&',
|
||||
'amp;' => '&',
|
||||
'and;' => "\xe2\x88\xa7",
|
||||
'ang;' => "\xe2\x88\xa0",
|
||||
'apos;' => "'",
|
||||
'aring' => "\xc3\xa5",
|
||||
'aring;' => "\xc3\xa5",
|
||||
'asymp;' => "\xe2\x89\x88",
|
||||
'atilde' => "\xc3\xa3",
|
||||
'atilde;' => "\xc3\xa3",
|
||||
'auml' => "\xc3\xa4",
|
||||
'auml;' => "\xc3\xa4",
|
||||
'bdquo;' => "\xe2\x80\x9e",
|
||||
'beta;' => "\xce\xb2",
|
||||
'brvbar' => "\xc2\xa6",
|
||||
'brvbar;' => "\xc2\xa6",
|
||||
'bull;' => "\xe2\x80\xa2",
|
||||
'cap;' => "\xe2\x88\xa9",
|
||||
'ccedil' => "\xc3\xa7",
|
||||
'ccedil;' => "\xc3\xa7",
|
||||
'cedil' => "\xc2\xb8",
|
||||
'cedil;' => "\xc2\xb8",
|
||||
'cent' => "\xc2\xa2",
|
||||
'cent;' => "\xc2\xa2",
|
||||
'chi;' => "\xcf\x87",
|
||||
'circ;' => "\xcb\x86",
|
||||
'clubs;' => "\xe2\x99\xa3",
|
||||
'cong;' => "\xe2\x89\x85",
|
||||
'copy' => "\xc2\xa9",
|
||||
'copy;' => "\xc2\xa9",
|
||||
'crarr;' => "\xe2\x86\xb5",
|
||||
'cup;' => "\xe2\x88\xaa",
|
||||
'curren' => "\xc2\xa4",
|
||||
'curren;' => "\xc2\xa4",
|
||||
'dArr;' => "\xe2\x87\x93",
|
||||
'dagger;' => "\xe2\x80\xa0",
|
||||
'darr;' => "\xe2\x86\x93",
|
||||
'deg' => "\xc2\xb0",
|
||||
'deg;' => "\xc2\xb0",
|
||||
'delta;' => "\xce\xb4",
|
||||
'diams;' => "\xe2\x99\xa6",
|
||||
'divide' => "\xc3\xb7",
|
||||
'divide;' => "\xc3\xb7",
|
||||
'eacute' => "\xc3\xa9",
|
||||
'eacute;' => "\xc3\xa9",
|
||||
'ecirc' => "\xc3\xaa",
|
||||
'ecirc;' => "\xc3\xaa",
|
||||
'egrave' => "\xc3\xa8",
|
||||
'egrave;' => "\xc3\xa8",
|
||||
'empty;' => "\xe2\x88\x85",
|
||||
'emsp;' => "\xe2\x80\x83",
|
||||
'ensp;' => "\xe2\x80\x82",
|
||||
'epsilon;' => "\xce\xb5",
|
||||
'equiv;' => "\xe2\x89\xa1",
|
||||
'eta;' => "\xce\xb7",
|
||||
'eth' => "\xc3\xb0",
|
||||
'eth;' => "\xc3\xb0",
|
||||
'euml' => "\xc3\xab",
|
||||
'euml;' => "\xc3\xab",
|
||||
'euro;' => "\xe2\x82\xac",
|
||||
'exist;' => "\xe2\x88\x83",
|
||||
'fnof;' => "\xc6\x92",
|
||||
'forall;' => "\xe2\x88\x80",
|
||||
'frac12' => "\xc2\xbd",
|
||||
'frac12;' => "\xc2\xbd",
|
||||
'frac14' => "\xc2\xbc",
|
||||
'frac14;' => "\xc2\xbc",
|
||||
'frac34' => "\xc2\xbe",
|
||||
'frac34;' => "\xc2\xbe",
|
||||
'frasl;' => "\xe2\x81\x84",
|
||||
'gamma;' => "\xce\xb3",
|
||||
'ge;' => "\xe2\x89\xa5",
|
||||
'gt' => '>',
|
||||
'gt;' => '>',
|
||||
'hArr;' => "\xe2\x87\x94",
|
||||
'harr;' => "\xe2\x86\x94",
|
||||
'hearts;' => "\xe2\x99\xa5",
|
||||
'hellip;' => "\xe2\x80\xa6",
|
||||
'iacute' => "\xc3\xad",
|
||||
'iacute;' => "\xc3\xad",
|
||||
'icirc' => "\xc3\xae",
|
||||
'icirc;' => "\xc3\xae",
|
||||
'iexcl' => "\xc2\xa1",
|
||||
'iexcl;' => "\xc2\xa1",
|
||||
'igrave' => "\xc3\xac",
|
||||
'igrave;' => "\xc3\xac",
|
||||
'image;' => "\xe2\x84\x91",
|
||||
'infin;' => "\xe2\x88\x9e",
|
||||
'int;' => "\xe2\x88\xab",
|
||||
'iota;' => "\xce\xb9",
|
||||
'iquest' => "\xc2\xbf",
|
||||
'iquest;' => "\xc2\xbf",
|
||||
'isin;' => "\xe2\x88\x88",
|
||||
'iuml' => "\xc3\xaf",
|
||||
'iuml;' => "\xc3\xaf",
|
||||
'kappa;' => "\xce\xba",
|
||||
'lArr;' => "\xe2\x87\x90",
|
||||
'lambda;' => "\xce\xbb",
|
||||
'lang;' => "\xe3\x80\x88",
|
||||
'laquo' => "\xc2\xab",
|
||||
'laquo;' => "\xc2\xab",
|
||||
'larr;' => "\xe2\x86\x90",
|
||||
'lceil;' => "\xe2\x8c\x88",
|
||||
'ldquo;' => "\xe2\x80\x9c",
|
||||
'le;' => "\xe2\x89\xa4",
|
||||
'lfloor;' => "\xe2\x8c\x8a",
|
||||
'lowast;' => "\xe2\x88\x97",
|
||||
'loz;' => "\xe2\x97\x8a",
|
||||
'lrm;' => "\xe2\x80\x8e",
|
||||
'lsaquo;' => "\xe2\x80\xb9",
|
||||
'lsquo;' => "\xe2\x80\x98",
|
||||
'lt' => '<',
|
||||
'lt;' => '<',
|
||||
'macr' => "\xc2\xaf",
|
||||
'macr;' => "\xc2\xaf",
|
||||
'mdash;' => "\xe2\x80\x94",
|
||||
'micro' => "\xc2\xb5",
|
||||
'micro;' => "\xc2\xb5",
|
||||
'middot' => "\xc2\xb7",
|
||||
'middot;' => "\xc2\xb7",
|
||||
'minus;' => "\xe2\x88\x92",
|
||||
'mu;' => "\xce\xbc",
|
||||
'nabla;' => "\xe2\x88\x87",
|
||||
'nbsp' => "\xc2\xa0",
|
||||
'nbsp;' => "\xc2\xa0",
|
||||
'ndash;' => "\xe2\x80\x93",
|
||||
'ne;' => "\xe2\x89\xa0",
|
||||
'ni;' => "\xe2\x88\x8b",
|
||||
'not' => "\xc2\xac",
|
||||
'not;' => "\xc2\xac",
|
||||
'notin;' => "\xe2\x88\x89",
|
||||
'nsub;' => "\xe2\x8a\x84",
|
||||
'ntilde' => "\xc3\xb1",
|
||||
'ntilde;' => "\xc3\xb1",
|
||||
'nu;' => "\xce\xbd",
|
||||
'oacute' => "\xc3\xb3",
|
||||
'oacute;' => "\xc3\xb3",
|
||||
'ocirc' => "\xc3\xb4",
|
||||
'ocirc;' => "\xc3\xb4",
|
||||
'oelig;' => "\xc5\x93",
|
||||
'ograve' => "\xc3\xb2",
|
||||
'ograve;' => "\xc3\xb2",
|
||||
'oline;' => "\xe2\x80\xbe",
|
||||
'omega;' => "\xcf\x89",
|
||||
'omicron;' => "\xce\xbf",
|
||||
'oplus;' => "\xe2\x8a\x95",
|
||||
'or;' => "\xe2\x88\xa8",
|
||||
'ordf' => "\xc2\xaa",
|
||||
'ordf;' => "\xc2\xaa",
|
||||
'ordm' => "\xc2\xba",
|
||||
'ordm;' => "\xc2\xba",
|
||||
'oslash' => "\xc3\xb8",
|
||||
'oslash;' => "\xc3\xb8",
|
||||
'otilde' => "\xc3\xb5",
|
||||
'otilde;' => "\xc3\xb5",
|
||||
'otimes;' => "\xe2\x8a\x97",
|
||||
'ouml' => "\xc3\xb6",
|
||||
'ouml;' => "\xc3\xb6",
|
||||
'para' => "\xc2\xb6",
|
||||
'para;' => "\xc2\xb6",
|
||||
'part;' => "\xe2\x88\x82",
|
||||
'permil;' => "\xe2\x80\xb0",
|
||||
'perp;' => "\xe2\x8a\xa5",
|
||||
'phi;' => "\xcf\x86",
|
||||
'pi;' => "\xcf\x80",
|
||||
'piv;' => "\xcf\x96",
|
||||
'plusmn' => "\xc2\xb1",
|
||||
'plusmn;' => "\xc2\xb1",
|
||||
'pound' => "\xc2\xa3",
|
||||
'pound;' => "\xc2\xa3",
|
||||
'prime;' => "\xe2\x80\xb2",
|
||||
'prod;' => "\xe2\x88\x8f",
|
||||
'prop;' => "\xe2\x88\x9d",
|
||||
'psi;' => "\xcf\x88",
|
||||
'quot' => '"',
|
||||
'quot;' => '"',
|
||||
'rArr;' => "\xe2\x87\x92",
|
||||
'radic;' => "\xe2\x88\x9a",
|
||||
'rang;' => "\xe3\x80\x89",
|
||||
'raquo' => "\xc2\xbb",
|
||||
'raquo;' => "\xc2\xbb",
|
||||
'rarr;' => "\xe2\x86\x92",
|
||||
'rceil;' => "\xe2\x8c\x89",
|
||||
'rdquo;' => "\xe2\x80\x9d",
|
||||
'real;' => "\xe2\x84\x9c",
|
||||
'reg' => "\xc2\xae",
|
||||
'reg;' => "\xc2\xae",
|
||||
'rfloor;' => "\xe2\x8c\x8b",
|
||||
'rho;' => "\xcf\x81",
|
||||
'rlm;' => "\xe2\x80\x8f",
|
||||
'rsaquo;' => "\xe2\x80\xba",
|
||||
'rsquo;' => "\xe2\x80\x99",
|
||||
'sbquo;' => "\xe2\x80\x9a",
|
||||
'scaron;' => "\xc5\xa1",
|
||||
'sdot;' => "\xe2\x8b\x85",
|
||||
'sect' => "\xc2\xa7",
|
||||
'sect;' => "\xc2\xa7",
|
||||
'shy' => "\xc2\xad",
|
||||
'shy;' => "\xc2\xad",
|
||||
'sigma;' => "\xcf\x83",
|
||||
'sigmaf;' => "\xcf\x82",
|
||||
'sim;' => "\xe2\x88\xbc",
|
||||
'spades;' => "\xe2\x99\xa0",
|
||||
'sub;' => "\xe2\x8a\x82",
|
||||
'sube;' => "\xe2\x8a\x86",
|
||||
'sum;' => "\xe2\x88\x91",
|
||||
'sup1' => "\xc2\xb9",
|
||||
'sup1;' => "\xc2\xb9",
|
||||
'sup2' => "\xc2\xb2",
|
||||
'sup2;' => "\xc2\xb2",
|
||||
'sup3' => "\xc2\xb3",
|
||||
'sup3;' => "\xc2\xb3",
|
||||
'sup;' => "\xe2\x8a\x83",
|
||||
'supe;' => "\xe2\x8a\x87",
|
||||
'szlig' => "\xc3\x9f",
|
||||
'szlig;' => "\xc3\x9f",
|
||||
'tau;' => "\xcf\x84",
|
||||
'there4;' => "\xe2\x88\xb4",
|
||||
'theta;' => "\xce\xb8",
|
||||
'thetasym;' => "\xcf\x91",
|
||||
'thinsp;' => "\xe2\x80\x89",
|
||||
'thorn' => "\xc3\xbe",
|
||||
'thorn;' => "\xc3\xbe",
|
||||
'tilde;' => "\xcb\x9c",
|
||||
'times' => "\xc3\x97",
|
||||
'times;' => "\xc3\x97",
|
||||
'trade;' => "\xe2\x84\xa2",
|
||||
'uArr;' => "\xe2\x87\x91",
|
||||
'uacute' => "\xc3\xba",
|
||||
'uacute;' => "\xc3\xba",
|
||||
'uarr;' => "\xe2\x86\x91",
|
||||
'ucirc' => "\xc3\xbb",
|
||||
'ucirc;' => "\xc3\xbb",
|
||||
'ugrave' => "\xc3\xb9",
|
||||
'ugrave;' => "\xc3\xb9",
|
||||
'uml' => "\xc2\xa8",
|
||||
'uml;' => "\xc2\xa8",
|
||||
'upsih;' => "\xcf\x92",
|
||||
'upsilon;' => "\xcf\x85",
|
||||
'uuml' => "\xc3\xbc",
|
||||
'uuml;' => "\xc3\xbc",
|
||||
'weierp;' => "\xe2\x84\x98",
|
||||
'xi;' => "\xce\xbe",
|
||||
'yacute' => "\xc3\xbd",
|
||||
'yacute;' => "\xc3\xbd",
|
||||
'yen' => "\xc2\xa5",
|
||||
'yen;' => "\xc2\xa5",
|
||||
'yuml' => "\xc3\xbf",
|
||||
'yuml;' => "\xc3\xbf",
|
||||
'zeta;' => "\xce\xb6",
|
||||
'zwj;' => "\xe2\x80\x8d",
|
||||
'zwnj;' => "\xe2\x80\x8c"
|
||||
}
|
||||
|
||||
ENCODINGS = %w[
|
||||
ansi_x3.4-1968
|
||||
iso-ir-6
|
||||
ansi_x3.4-1986
|
||||
iso_646.irv:1991
|
||||
ascii
|
||||
iso646-us
|
||||
us-ascii
|
||||
us
|
||||
ibm367
|
||||
cp367
|
||||
csascii
|
||||
ks_c_5601-1987
|
||||
korean
|
||||
iso-2022-kr
|
||||
csiso2022kr
|
||||
euc-kr
|
||||
iso-2022-jp
|
||||
csiso2022jp
|
||||
iso-2022-jp-2
|
||||
iso-ir-58
|
||||
chinese
|
||||
csiso58gb231280
|
||||
iso_8859-1:1987
|
||||
iso-ir-100
|
||||
iso_8859-1
|
||||
iso-8859-1
|
||||
latin1
|
||||
l1
|
||||
ibm819
|
||||
cp819
|
||||
csisolatin1
|
||||
iso_8859-2:1987
|
||||
iso-ir-101
|
||||
iso_8859-2
|
||||
iso-8859-2
|
||||
latin2
|
||||
l2
|
||||
csisolatin2
|
||||
iso_8859-3:1988
|
||||
iso-ir-109
|
||||
iso_8859-3
|
||||
iso-8859-3
|
||||
latin3
|
||||
l3
|
||||
csisolatin3
|
||||
iso_8859-4:1988
|
||||
iso-ir-110
|
||||
iso_8859-4
|
||||
iso-8859-4
|
||||
latin4
|
||||
l4
|
||||
csisolatin4
|
||||
iso_8859-6:1987
|
||||
iso-ir-127
|
||||
iso_8859-6
|
||||
iso-8859-6
|
||||
ecma-114
|
||||
asmo-708
|
||||
arabic
|
||||
csisolatinarabic
|
||||
iso_8859-7:1987
|
||||
iso-ir-126
|
||||
iso_8859-7
|
||||
iso-8859-7
|
||||
elot_928
|
||||
ecma-118
|
||||
greek
|
||||
greek8
|
||||
csisolatingreek
|
||||
iso_8859-8:1988
|
||||
iso-ir-138
|
||||
iso_8859-8
|
||||
iso-8859-8
|
||||
hebrew
|
||||
csisolatinhebrew
|
||||
iso_8859-5:1988
|
||||
iso-ir-144
|
||||
iso_8859-5
|
||||
iso-8859-5
|
||||
cyrillic
|
||||
csisolatincyrillic
|
||||
iso_8859-9:1989
|
||||
iso-ir-148
|
||||
iso_8859-9
|
||||
iso-8859-9
|
||||
latin5
|
||||
l5
|
||||
csisolatin5
|
||||
iso-8859-10
|
||||
iso-ir-157
|
||||
l6
|
||||
iso_8859-10:1992
|
||||
csisolatin6
|
||||
latin6
|
||||
hp-roman8
|
||||
roman8
|
||||
r8
|
||||
ibm037
|
||||
cp037
|
||||
csibm037
|
||||
ibm424
|
||||
cp424
|
||||
csibm424
|
||||
ibm437
|
||||
cp437
|
||||
437
|
||||
cspc8codepage437
|
||||
ibm500
|
||||
cp500
|
||||
csibm500
|
||||
ibm775
|
||||
cp775
|
||||
cspc775baltic
|
||||
ibm850
|
||||
cp850
|
||||
850
|
||||
cspc850multilingual
|
||||
ibm852
|
||||
cp852
|
||||
852
|
||||
cspcp852
|
||||
ibm855
|
||||
cp855
|
||||
855
|
||||
csibm855
|
||||
ibm857
|
||||
cp857
|
||||
857
|
||||
csibm857
|
||||
ibm860
|
||||
cp860
|
||||
860
|
||||
csibm860
|
||||
ibm861
|
||||
cp861
|
||||
861
|
||||
cp-is
|
||||
csibm861
|
||||
ibm862
|
||||
cp862
|
||||
862
|
||||
cspc862latinhebrew
|
||||
ibm863
|
||||
cp863
|
||||
863
|
||||
csibm863
|
||||
ibm864
|
||||
cp864
|
||||
csibm864
|
||||
ibm865
|
||||
cp865
|
||||
865
|
||||
csibm865
|
||||
ibm866
|
||||
cp866
|
||||
866
|
||||
csibm866
|
||||
ibm869
|
||||
cp869
|
||||
869
|
||||
cp-gr
|
||||
csibm869
|
||||
ibm1026
|
||||
cp1026
|
||||
csibm1026
|
||||
koi8-r
|
||||
cskoi8r
|
||||
koi8-u
|
||||
big5-hkscs
|
||||
ptcp154
|
||||
csptcp154
|
||||
pt154
|
||||
cp154
|
||||
utf-7
|
||||
utf-16be
|
||||
utf-16le
|
||||
utf-16
|
||||
utf-8
|
||||
iso-8859-13
|
||||
iso-8859-14
|
||||
iso-ir-199
|
||||
iso_8859-14:1998
|
||||
iso_8859-14
|
||||
latin8
|
||||
iso-celtic
|
||||
l8
|
||||
iso-8859-15
|
||||
iso_8859-15
|
||||
iso-8859-16
|
||||
iso-ir-226
|
||||
iso_8859-16:2001
|
||||
iso_8859-16
|
||||
latin10
|
||||
l10
|
||||
gbk
|
||||
cp936
|
||||
ms936
|
||||
gb18030
|
||||
shift_jis
|
||||
ms_kanji
|
||||
csshiftjis
|
||||
euc-jp
|
||||
gb2312
|
||||
big5
|
||||
csbig5
|
||||
windows-1250
|
||||
windows-1251
|
||||
windows-1252
|
||||
windows-1253
|
||||
windows-1254
|
||||
windows-1255
|
||||
windows-1256
|
||||
windows-1257
|
||||
windows-1258
|
||||
tis-620
|
||||
hz-gb-2312
|
||||
]
|
||||
|
||||
end
|
1
vendor/plugins/HTML5lib/lib/html5/filters.rb
vendored
Normal file
1
vendor/plugins/HTML5lib/lib/html5/filters.rb
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
require 'html5/filters/optionaltags'
|
|
@ -1,7 +1,7 @@
|
|||
require 'delegate'
|
||||
require 'enumerator'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module Filters
|
||||
class Base < SimpleDelegator
|
||||
include Enumerable
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/filters/base'
|
||||
require 'html5/filters/base'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module Filters
|
||||
class InjectMetaCharset < Base
|
||||
def initialize(source, encoding)
|
|
@ -1,7 +1,7 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5lib/filters/base'
|
||||
require 'html5/constants'
|
||||
require 'html5/filters/base'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module Filters
|
||||
|
||||
class OptionalTagFilter < Base
|
|
@ -1,7 +1,7 @@
|
|||
require 'html5lib/filters/base'
|
||||
require 'html5lib/sanitizer'
|
||||
require 'html5/filters/base'
|
||||
require 'html5/sanitizer'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module Filters
|
||||
class HTMLSanitizeFilter < Base
|
||||
include HTMLSanitizeModule
|
|
@ -1,7 +1,7 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5lib/filters/base'
|
||||
require 'html5/constants'
|
||||
require 'html5/filters/base'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module Filters
|
||||
class WhitespaceFilter < Base
|
||||
|
|
@ -1,246 +1,246 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5lib/tokenizer'
|
||||
require 'html5lib/treebuilders/rexml'
|
||||
|
||||
Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
|
||||
require 'html5lib/html5parser/' + File.basename(path)
|
||||
end
|
||||
|
||||
module HTML5lib
|
||||
|
||||
# Error in parsed document
|
||||
class ParseError < Exception; end
|
||||
class AssertionError < Exception; end
|
||||
|
||||
# HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
|
||||
#
|
||||
class HTMLParser
|
||||
|
||||
attr_accessor :phase, :firstStartTag, :innerHTML, :lastPhase, :insertFromTable
|
||||
|
||||
attr_reader :phases, :tokenizer, :tree, :errors
|
||||
|
||||
def self.parse(stream, options = {})
|
||||
encoding = options.delete(:encoding)
|
||||
new(options).parse(stream,encoding)
|
||||
end
|
||||
|
||||
def self.parseFragment(stream, options = {})
|
||||
container = options.delete(:container) || 'div'
|
||||
encoding = options.delete(:encoding)
|
||||
new(options).parseFragment(stream,container,encoding)
|
||||
end
|
||||
|
||||
@@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption
|
||||
inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )
|
||||
|
||||
# :strict - raise an exception when a parse error is encountered
|
||||
# :tree - a treebuilder class controlling the type of tree that will be
|
||||
# returned. Built in treebuilders can be accessed through
|
||||
# HTML5lib::TreeBuilders[treeType]
|
||||
def initialize(options = {})
|
||||
@strict = false
|
||||
@errors = []
|
||||
|
||||
@tokenizer = HTMLTokenizer
|
||||
@tree = TreeBuilders::REXML::TreeBuilder
|
||||
|
||||
options.each { |name, value| instance_variable_set("@#{name}", value) }
|
||||
|
||||
@tree = @tree.new
|
||||
|
||||
@phases = @@phases.inject({}) do |phases, phase_name|
|
||||
phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
|
||||
phases[phase_name.to_sym] = HTML5lib.const_get(phase_class_name).new(self, @tree)
|
||||
phases
|
||||
end
|
||||
end
|
||||
|
||||
def _parse(stream, innerHTML, encoding, container = 'div')
|
||||
@tree.reset
|
||||
@firstStartTag = false
|
||||
@errors = []
|
||||
|
||||
@tokenizer = @tokenizer.class unless Class === @tokenizer
|
||||
@tokenizer = @tokenizer.new(stream, :encoding => encoding,
|
||||
:parseMeta => !innerHTML)
|
||||
|
||||
if innerHTML
|
||||
case @innerHTML = container.downcase
|
||||
when 'title', 'textarea'
|
||||
@tokenizer.contentModelFlag = :RCDATA
|
||||
when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
|
||||
@tokenizer.contentModelFlag = :CDATA
|
||||
when 'plaintext'
|
||||
@tokenizer.contentModelFlag = :PLAINTEXT
|
||||
else
|
||||
# contentModelFlag already is PCDATA
|
||||
#@tokenizer.contentModelFlag = :PCDATA
|
||||
end
|
||||
|
||||
@phase = @phases[:rootElement]
|
||||
@phase.insertHtmlElement
|
||||
resetInsertionMode
|
||||
else
|
||||
@innerHTML = false
|
||||
@phase = @phases[:initial]
|
||||
end
|
||||
|
||||
# We only seem to have InBodyPhase testcases where the following is
|
||||
# relevant ... need others too
|
||||
@lastPhase = nil
|
||||
|
||||
# XXX This is temporary for the moment so there isn't any other
|
||||
# changes needed for the parser to work with the iterable tokenizer
|
||||
@tokenizer.each do |token|
|
||||
token = normalizeToken(token)
|
||||
|
||||
method = 'process%s' % token[:type]
|
||||
|
||||
case token[:type]
|
||||
when :Characters, :SpaceCharacters, :Comment
|
||||
@phase.send method, token[:data]
|
||||
when :StartTag
|
||||
@phase.send method, token[:name], token[:data]
|
||||
when :EndTag
|
||||
@phase.send method, token[:name]
|
||||
when :Doctype
|
||||
@phase.send method, token[:name], token[:publicId],
|
||||
token[:systemId], token[:correct]
|
||||
else
|
||||
parseError(token[:data])
|
||||
end
|
||||
end
|
||||
|
||||
# When the loop finishes it's EOF
|
||||
@phase.processEOF
|
||||
end
|
||||
|
||||
# Parse a HTML document into a well-formed tree
|
||||
#
|
||||
# stream - a filelike object or string containing the HTML to be parsed
|
||||
#
|
||||
# The optional encoding parameter must be a string that indicates
|
||||
# the encoding. If specified, that encoding will be used,
|
||||
# regardless of any BOM or later declaration (such as in a meta
|
||||
# element)
|
||||
def parse(stream, encoding=nil)
|
||||
_parse(stream, false, encoding)
|
||||
return @tree.getDocument
|
||||
end
|
||||
|
||||
# Parse a HTML fragment into a well-formed tree fragment
|
||||
|
||||
# container - name of the element we're setting the innerHTML property
|
||||
# if set to nil, default to 'div'
|
||||
#
|
||||
# stream - a filelike object or string containing the HTML to be parsed
|
||||
#
|
||||
# The optional encoding parameter must be a string that indicates
|
||||
# the encoding. If specified, that encoding will be used,
|
||||
# regardless of any BOM or later declaration (such as in a meta
|
||||
# element)
|
||||
def parseFragment(stream, container='div', encoding=nil)
|
||||
_parse(stream, true, encoding, container)
|
||||
return @tree.getFragment
|
||||
end
|
||||
|
||||
def parseError(data = 'XXX ERROR MESSAGE NEEDED')
|
||||
# XXX The idea is to make data mandatory.
|
||||
@errors.push([@tokenizer.stream.position, data])
|
||||
raise ParseError if @strict
|
||||
end
|
||||
|
||||
# HTML5 specific normalizations to the token stream
|
||||
def normalizeToken(token)
|
||||
|
||||
if token[:type] == :EmptyTag
|
||||
# When a solidus (/) is encountered within a tag name what happens
|
||||
# depends on whether the current tag name matches that of a void
|
||||
# element. If it matches a void element atheists did the wrong
|
||||
# thing and if it doesn't it's wrong for everyone.
|
||||
|
||||
unless VOID_ELEMENTS.include?(token[:name])
|
||||
parseError(_('Solidus (/) incorrectly placed in tag.'))
|
||||
end
|
||||
|
||||
token[:type] = :StartTag
|
||||
end
|
||||
|
||||
if token[:type] == :StartTag
|
||||
token[:name] = token[:name].tr(ASCII_UPPERCASE,ASCII_LOWERCASE)
|
||||
|
||||
# We need to remove the duplicate attributes and convert attributes
|
||||
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
||||
|
||||
unless token[:data].empty?
|
||||
data = token[:data].reverse.map { |attr, value| [attr.tr(ASCII_UPPERCASE, ASCII_LOWERCASE), value] }
|
||||
token[:data] = Hash[*data.flatten]
|
||||
end
|
||||
|
||||
elsif token[:type] == :EndTag
|
||||
parseError(_('End tag contains unexpected attributes.')) unless token[:data].empty?
|
||||
token[:name] = token[:name].downcase
|
||||
end
|
||||
|
||||
return token
|
||||
end
|
||||
|
||||
@@new_modes = {
|
||||
'select' => :inSelect,
|
||||
'td' => :inCell,
|
||||
'th' => :inCell,
|
||||
'tr' => :inRow,
|
||||
'tbody' => :inTableBody,
|
||||
'thead' => :inTableBody,
|
||||
'tfoot' => :inTableBody,
|
||||
'caption' => :inCaption,
|
||||
'colgroup' => :inColumnGroup,
|
||||
'table' => :inTable,
|
||||
'head' => :inBody,
|
||||
'body' => :inBody,
|
||||
'frameset' => :inFrameset
|
||||
}
|
||||
|
||||
def resetInsertionMode
|
||||
# The name of this method is mostly historical. (It's also used in the
|
||||
# specification.)
|
||||
last = false
|
||||
|
||||
@tree.openElements.reverse.each do |node|
|
||||
nodeName = node.name
|
||||
|
||||
if node == @tree.openElements[0]
|
||||
last = true
|
||||
unless ['td', 'th'].include?(nodeName)
|
||||
# XXX
|
||||
# assert @innerHTML
|
||||
nodeName = @innerHTML
|
||||
end
|
||||
end
|
||||
|
||||
# Check for conditions that should only happen in the innerHTML
|
||||
# case
|
||||
if ['select', 'colgroup', 'head', 'frameset'].include?(nodeName)
|
||||
# XXX
|
||||
# assert @innerHTML
|
||||
end
|
||||
|
||||
if @@new_modes.has_key?(nodeName)
|
||||
@phase = @phases[@@new_modes[nodeName]]
|
||||
elsif nodeName == 'html'
|
||||
@phase = @phases[@tree.headPointer.nil?? :beforeHead : :afterHead]
|
||||
elsif last
|
||||
@phase = @phases[:inBody]
|
||||
else
|
||||
next
|
||||
end
|
||||
|
||||
break
|
||||
end
|
||||
end
|
||||
|
||||
def _(string); string; end
|
||||
end
|
||||
|
||||
end
|
||||
require 'html5/constants'
|
||||
require 'html5/tokenizer'
|
||||
require 'html5/treebuilders/rexml'
|
||||
|
||||
Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
|
||||
require 'html5/html5parser/' + File.basename(path)
|
||||
end
|
||||
|
||||
module HTML5

  # Error in parsed document
  class ParseError < Exception; end
  class AssertionError < Exception; end

  # HTML parser. Generates a tree structure from a stream of (possibly
  # malformed) HTML.
  #
  # The parser pulls tokens from a tokenizer, normalizes them and hands each
  # one to the current phase object (see @@phases); phases build the tree
  # through the tree builder and switch the parser's phase as needed.
  class HTMLParser

    attr_accessor :phase, :firstStartTag, :innerHTML, :lastPhase, :insertFromTable

    attr_reader :phases, :tokenizer, :tree, :errors

    # Parses +stream+ as a complete document and returns the tree builder's
    # document.
    #
    # options - constructor options (see #initialize) plus :encoding, which
    #           is passed to #parse. The caller's hash is not modified.
    def self.parse(stream, options = {})
      options = options.dup # do not strip keys from the caller's hash
      encoding = options.delete(:encoding)
      new(options).parse(stream, encoding)
    end

    # Parses +stream+ as a fragment and returns the tree builder's fragment.
    #
    # options - constructor options (see #initialize) plus :container
    #           (defaults to 'div') and :encoding, which are passed to
    #           #parseFragment. The caller's hash is not modified.
    def self.parseFragment(stream, options = {})
      options = options.dup # do not strip keys from the caller's hash
      container = options.delete(:container) || 'div'
      encoding = options.delete(:encoding)
      new(options).parseFragment(stream, container, encoding)
    end

    # Names of the phases; each one is instantiated from the class
    # HTML5::<Name>Phase and stored in @phases under the symbolized name.
    @@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption
      inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )

    # :strict - raise an exception when a parse error is encountered
    # :tree - a treebuilder class controlling the type of tree that will be
    # returned. Built in treebuilders can be accessed through
    # HTML5::TreeBuilders[treeType]
    #
    # Every option is stored in the instance variable of the same name, so
    # :tokenizer can be overridden the same way.
    def initialize(options = {})
      @strict = false
      @errors = []

      @tokenizer = HTMLTokenizer
      @tree = TreeBuilders::REXML::TreeBuilder

      options.each { |name, value| instance_variable_set("@#{name}", value) }

      @tree = @tree.new

      @phases = @@phases.inject({}) do |phases, phase_name|
        # 'inBody' -> 'InBodyPhase'
        phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
        phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
        phases
      end
    end

    # Runs the tokenizer over +stream+ and feeds every token to the current
    # phase, then signals end of file to the phase.
    #
    # stream    - passed to the tokenizer
    # innerHTML - true when parsing a fragment
    # encoding  - passed to the tokenizer as :encoding
    # container - name of the fragment's context element; nil means 'div'
    def _parse(stream, innerHTML, encoding, container = 'div')
      @tree.reset
      @firstStartTag = false
      @errors = []

      # A previous run leaves a tokenizer instance in @tokenizer; go back to
      # its class so a fresh tokenizer can be built for this stream.
      @tokenizer = @tokenizer.class unless Class === @tokenizer
      @tokenizer = @tokenizer.new(stream, :encoding => encoding,
        :parseMeta => !innerHTML)

      if innerHTML
        case @innerHTML = (container || 'div').downcase
        when 'title', 'textarea'
          @tokenizer.contentModelFlag = :RCDATA
        when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
          @tokenizer.contentModelFlag = :CDATA
        when 'plaintext'
          @tokenizer.contentModelFlag = :PLAINTEXT
        else
          # contentModelFlag already is PCDATA
          #@tokenizer.contentModelFlag = :PCDATA
        end

        @phase = @phases[:rootElement]
        @phase.insertHtmlElement
        resetInsertionMode
      else
        @innerHTML = false
        @phase = @phases[:initial]
      end

      # We only seem to have InBodyPhase testcases where the following is
      # relevant ... need others too
      @lastPhase = nil

      # XXX This is temporary for the moment so there isn't any other
      # changes needed for the parser to work with the iterable tokenizer
      @tokenizer.each do |token|
        token = normalizeToken(token)

        method = 'process%s' % token[:type]

        case token[:type]
        when :Characters, :SpaceCharacters, :Comment
          @phase.send method, token[:data]
        when :StartTag
          @phase.send method, token[:name], token[:data]
        when :EndTag
          @phase.send method, token[:name]
        when :Doctype
          @phase.send method, token[:name], token[:publicId],
            token[:systemId], token[:correct]
        else
          # Any other token type is recorded as a parse error
          parseError(token[:data])
        end
      end

      # When the loop finishes it's EOF
      @phase.processEOF
    end

    # Parse a HTML document into a well-formed tree
    #
    # stream - a filelike object or string containing the HTML to be parsed
    #
    # The optional encoding parameter must be a string that indicates
    # the encoding. If specified, that encoding will be used,
    # regardless of any BOM or later declaration (such as in a meta
    # element)
    def parse(stream, encoding = nil)
      _parse(stream, false, encoding)
      @tree.getDocument
    end

    # Parse a HTML fragment into a well-formed tree fragment
    #
    # container - name of the element we're setting the innerHTML property
    # if set to nil, default to 'div'
    #
    # stream - a filelike object or string containing the HTML to be parsed
    #
    # The optional encoding parameter must be a string that indicates
    # the encoding. If specified, that encoding will be used,
    # regardless of any BOM or later declaration (such as in a meta
    # element)
    def parseFragment(stream, container = 'div', encoding = nil)
      _parse(stream, true, encoding, container)
      @tree.getFragment
    end

    # Records a parse error as [stream position, message] in #errors and,
    # in strict mode, raises ParseError carrying the message.
    def parseError(data = 'XXX ERROR MESSAGE NEEDED')
      # XXX The idea is to make data mandatory.
      @errors.push([@tokenizer.stream.position, data])
      raise ParseError, data if @strict
    end

    # HTML5 specific normalizations to the token stream
    #
    # Modifies +token+ in place and returns it.
    def normalizeToken(token)
      if token[:type] == :EmptyTag
        # A solidus (/) in a tag is reported as a parse error unless the
        # tag names a void element. Either way the token is then treated
        # as an ordinary start tag.
        unless VOID_ELEMENTS.include?(token[:name])
          parseError(_('Solidus (/) incorrectly placed in tag.'))
        end

        token[:type] = :StartTag
      end

      if token[:type] == :StartTag
        token[:name] = token[:name].tr(ASCII_UPPERCASE, ASCII_LOWERCASE)

        # We need to remove the duplicate attributes and convert attributes
        # to a Hash so that [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
        # Assigning in reverse order lets the first occurrence win.
        unless token[:data].empty?
          attributes = {}
          token[:data].reverse_each do |attr, value|
            attributes[attr.tr(ASCII_UPPERCASE, ASCII_LOWERCASE)] = value
          end
          token[:data] = attributes
        end

      elsif token[:type] == :EndTag
        parseError(_('End tag contains unexpected attributes.')) unless token[:data].empty?
        token[:name] = token[:name].downcase
      end

      token
    end

    # Maps the name of an open element to the key of the phase that
    # resetInsertionMode switches the parser to when it meets that element.
    @@new_modes = {
      'select'   => :inSelect,
      'td'       => :inCell,
      'th'       => :inCell,
      'tr'       => :inRow,
      'tbody'    => :inTableBody,
      'thead'    => :inTableBody,
      'tfoot'    => :inTableBody,
      'caption'  => :inCaption,
      'colgroup' => :inColumnGroup,
      'table'    => :inTable,
      'head'     => :inBody,
      'body'     => :inBody,
      'frameset' => :inFrameset
    }

    # Chooses the current phase from the stack of open elements.
    #
    # Walks @tree.openElements from the most recently opened element down
    # to the first one and stops at the first element that determines a
    # phase: a name in @@new_modes selects the mapped phase; 'html' selects
    # :beforeHead when the tree has no head pointer yet and :afterHead
    # otherwise; the bottom-most element falls back to :inBody.
    #
    # The name of this method is mostly historical. (It's also used in the
    # specification.)
    def resetInsertionMode
      last = false

      # reverse_each walks the stack top-down without copying it
      @tree.openElements.reverse_each do |node|
        nodeName = node.name

        if node == @tree.openElements[0]
          last = true
          # For the bottom-most element the innerHTML container name is
          # used instead of the element's own name, unless it is a cell.
          # XXX assert @innerHTML
          nodeName = @innerHTML unless ['td', 'th'].include?(nodeName)
        end

        # XXX 'select', 'colgroup', 'head' and 'frameset' should only be
        # seen here in the innerHTML case (assert @innerHTML)

        if @@new_modes.has_key?(nodeName)
          @phase = @phases[@@new_modes[nodeName]]
        elsif nodeName == 'html'
          @phase = @phases[@tree.headPointer.nil? ? :beforeHead : :afterHead]
        elsif last
          @phase = @phases[:inBody]
        else
          next
        end

        break
      end
    end

    # Returns +string+ unchanged; error messages are passed through it.
    def _(string); string; end
  end

end
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class AfterBodyPhase < Phase
|
||||
|
||||
handle_end 'html'
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class AfterFramesetPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#after3
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class AfterHeadPhase < Phase
|
||||
|
||||
handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'
|
|
@ -1,11 +1,11 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class BeforeHeadPhase < Phase
|
||||
|
||||
handle_start 'html', 'head'
|
||||
|
||||
handle_end %w( html head body br ) => 'ImplyHead'
|
||||
handle_end %w( html head body br p ) => 'ImplyHead'
|
||||
|
||||
def processEOF
|
||||
startTagHead('head', {})
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InBodyPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
|
||||
|
@ -112,7 +112,7 @@ module HTML5lib
|
|||
|
||||
def startTagForm(name, attributes)
|
||||
if @tree.formPointer
|
||||
@parser.parseError('Unexpected start tag (form). Ignored.')
|
||||
@parser.parseError(_('Unexpected start tag (form). Ignored.'))
|
||||
else
|
||||
endTagP('p') if in_scope?('p')
|
||||
@tree.insertElement(name, attributes)
|
||||
|
@ -129,9 +129,9 @@ module HTML5lib
|
|||
if stopName.include?(node.name)
|
||||
poppedNodes = (0..i).collect { @tree.openElements.pop }
|
||||
if i >= 1
|
||||
@parser.parseError("Missing end tag%s (%s)" % [
|
||||
@parser.parseError(_("Missing end tag%s (%s)" % [
|
||||
(i>1 ? 's' : ''),
|
||||
poppedNodes.reverse.map {|item| item.name}.join(', ')])
|
||||
poppedNodes.reverse.map {|item| item.name}.join(', ')]))
|
||||
end
|
||||
break
|
||||
end
|
||||
|
@ -251,7 +251,7 @@ module HTML5lib
|
|||
end
|
||||
|
||||
def startTagIsindex(name, attributes)
|
||||
@parser.parseError("Unexpected start tag isindex. Don't use it!")
|
||||
@parser.parseError(_("Unexpected start tag isindex. Don't use it!"))
|
||||
return if @tree.formPointer
|
||||
processStartTag('form', {})
|
||||
processStartTag('hr', {})
|
||||
|
@ -311,8 +311,13 @@ module HTML5lib
|
|||
|
||||
# Handles a </p> end tag.
#
# When a p element is in scope, implied end tags are generated and open
# elements are popped until no p is in scope any more. When none is in
# scope, a p element is opened through startTagCloseP and then closed by
# re-running this handler. A parse error is reported whenever the current
# node is not a p element.
def endTagP(name)
  @tree.generateImpliedEndTags('p') if in_scope?('p')
  @parser.parseError(_('Unexpected end tag (p).')) unless @tree.openElements[-1].name == 'p'
  if in_scope?('p')
    @tree.openElements.pop while in_scope?('p')
  else
    # No p to close: create one, then close it
    startTagCloseP('p', {})
    endTagP('p')
  end
end
|
||||
|
||||
def endTagBody(name)
|
||||
|
@ -342,7 +347,7 @@ module HTML5lib
|
|||
@tree.generateImpliedEndTags if in_scope?(name)
|
||||
|
||||
unless @tree.openElements[-1].name == name
|
||||
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
|
||||
@parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
|
||||
end
|
||||
|
||||
if in_scope?(name)
|
||||
|
@ -351,7 +356,14 @@ module HTML5lib
|
|||
end
|
||||
|
||||
# Handles a </form> end tag.
#
# Generates implied end tags when an element called +name+ is in scope,
# then pops the current node if it is that element; otherwise a parse
# error is reported and nothing is popped. The tree's form pointer is
# cleared in every case.
def endTagForm(name)
  @tree.generateImpliedEndTags if in_scope?(name)

  if @tree.openElements[-1].name != name
    @parser.parseError(_("End tag (form) seen too early. Ignored."))
  else
    @tree.openElements.pop
  end
  @tree.formPointer = nil
end
|
||||
|
||||
|
@ -361,7 +373,7 @@ module HTML5lib
|
|||
@tree.generateImpliedEndTags(name)
|
||||
|
||||
unless @tree.openElements[-1].name == name
|
||||
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
|
||||
@parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -377,7 +389,7 @@ module HTML5lib
|
|||
end
|
||||
|
||||
unless @tree.openElements[-1].name == name
|
||||
@parser.parseError(("Unexpected end tag (#{name}). Expected other end tag."))
|
||||
@parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag."))
|
||||
end
|
||||
|
||||
HEADING_ELEMENTS.each do |element|
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InCaptionPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InCellPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InColumnGroupPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-column
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InFramesetPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
|
|
@ -1,12 +1,12 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InHeadPhase < Phase
|
||||
|
||||
handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta )
|
||||
|
||||
handle_end 'head'
|
||||
handle_end %w( html body br ) => 'ImplyAfterHead'
|
||||
handle_end %w( html body br p ) => 'ImplyAfterHead'
|
||||
handle_end %w( title style script )
|
||||
|
||||
def processEOF
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InRowPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InSelectPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-select
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InTableBodyPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InTablePhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InitialPhase < Phase
|
||||
|
||||
# This phase deals with error handling as well which is currently not
|
|
@ -1,4 +1,4 @@
|
|||
module HTML5lib
|
||||
module HTML5
|
||||
# Base class for helper objects that implement each phase of processing.
|
||||
#
|
||||
# Handler methods should be in the following order (they can be omitted):
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class RootElementPhase < Phase
|
||||
|
||||
def processEOF
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class TrailingEndPhase < Phase
|
||||
|
||||
def processEOF
|
|
@ -1,7 +1,7 @@
|
|||
require 'stringio'
|
||||
require 'html5lib/constants'
|
||||
require 'html5/constants'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
|
||||
# Provides a unicode stream of characters to the HTMLTokenizer.
|
||||
|
||||
|
@ -10,7 +10,7 @@ module HTML5lib
|
|||
|
||||
class HTMLInputStream
|
||||
|
||||
attr_accessor :queue, :char_encoding
|
||||
attr_accessor :queue, :char_encoding, :errors
|
||||
|
||||
# Initialises the HTMLInputStream.
|
||||
#
|
||||
|
@ -40,25 +40,31 @@ module HTML5lib
|
|||
#Number of bytes to use when looking for a meta element with
|
||||
#encoding information
|
||||
@NUM_BYTES_META = 512
|
||||
#Number of bytes to use when using detecting encoding using chardet
|
||||
@NUM_BYTES_CHARDET = 256
|
||||
#Number of bytes to use when reading content
|
||||
@NUM_BYTES_BUFFER = 1024
|
||||
|
||||
#Encoding to use if no other information can be found
|
||||
@DEFAULT_ENCODING = 'windows-1252'
|
||||
|
||||
#Detect encoding iff no explicit "transport level" encoding is supplied
|
||||
if @encoding.nil? or not HTML5lib.is_valid_encoding(@encoding)
|
||||
if @encoding.nil? or not HTML5.is_valid_encoding(@encoding)
|
||||
@char_encoding = detect_encoding
|
||||
else
|
||||
@char_encoding = @encoding
|
||||
end
|
||||
|
||||
# Read bytes from stream decoding them into Unicode
|
||||
uString = @raw_stream.read
|
||||
@buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
|
||||
if @char_encoding == 'windows-1252'
|
||||
@win1252 = true
|
||||
elsif @char_encoding != 'utf-8'
|
||||
begin
|
||||
require 'iconv'
|
||||
begin
|
||||
uString = Iconv.iconv('utf-8', @char_encoding, uString).first
|
||||
@buffer << @raw_stream.read unless @raw_stream.eof?
|
||||
@buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
|
||||
rescue
|
||||
@win1252 = true
|
||||
end
|
||||
|
@ -67,10 +73,8 @@ module HTML5lib
|
|||
end
|
||||
end
|
||||
|
||||
# Convert the unicode string into a list to be used as the data stream
|
||||
@data_stream = uString
|
||||
|
||||
@queue = []
|
||||
@errors = []
|
||||
|
||||
# Reset position in the list to read from
|
||||
@tell = 0
|
||||
|
@ -109,9 +113,22 @@ module HTML5lib
|
|||
begin
|
||||
require 'rubygems'
|
||||
require 'UniversalDetector' # gem install chardet
|
||||
buffer = @raw_stream.read
|
||||
encoding = UniversalDetector::chardet(buffer)['encoding']
|
||||
seek(buffer, 0)
|
||||
buffers = []
|
||||
detector = UniversalDetector::Detector.instance
|
||||
detector.reset
|
||||
until @raw_stream.eof?
|
||||
buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
|
||||
break if !buffer or buffer.empty?
|
||||
buffers << buffer
|
||||
detector.feed(buffer)
|
||||
break if detector.instance_eval {@done}
|
||||
detector.instance_eval {
|
||||
@_mLastChar = @_mLastChar.chr if Fixnum === @_mLastChar
|
||||
}
|
||||
end
|
||||
detector.close
|
||||
encoding = detector.result['encoding']
|
||||
seek(buffers*'', 0)
|
||||
rescue LoadError
|
||||
end
|
||||
end
|
||||
|
@ -242,14 +259,20 @@ module HTML5lib
|
|||
unless @queue.empty?
|
||||
return @queue.shift
|
||||
else
|
||||
c = @data_stream[@tell]
|
||||
if @tell + 3 > @buffer.length and !@raw_stream.eof?
|
||||
# read next block
|
||||
@buffer = @buffer[@tell .. -1] + @raw_stream.read(@NUM_BYTES_BUFFER)
|
||||
@tell = 0
|
||||
end
|
||||
|
||||
c = @buffer[@tell]
|
||||
@tell += 1
|
||||
|
||||
case c
|
||||
when 0x01 .. 0x7F
|
||||
if c == 0x0D
|
||||
# normalize newlines
|
||||
@tell += 1 if @data_stream[@tell] == 0x0A
|
||||
@tell += 1 if @buffer[@tell] == 0x0A
|
||||
c = 0x0A
|
||||
end
|
||||
|
||||
|
@ -276,7 +299,7 @@ module HTML5lib
|
|||
when 0xC0 .. 0xFF
|
||||
if @win1252
|
||||
"\xC3" + (c-64).chr # convert to utf-8
|
||||
elsif @data_stream[@tell-1 .. -1] =~ /^
|
||||
elsif @buffer[@tell-1 .. @tell+3] =~ /^
|
||||
( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
|
||||
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
|
||||
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
|
||||
|
@ -292,6 +315,8 @@ module HTML5lib
|
|||
end
|
||||
|
||||
when 0x00
|
||||
@errors.push('null character found in input stream, ' +
|
||||
'replaced with U+FFFD')
|
||||
[0xFFFD].pack('U') # null characters are invalid
|
||||
|
||||
else
|
||||
|
@ -317,6 +342,10 @@ module HTML5lib
|
|||
@queue.insert(0, c) unless c == :EOF
|
||||
return char_stack.join('')
|
||||
end
|
||||
|
||||
# Puts +characters+ back at the front of the queue so that they are the
# next ones returned from it.
#
# characters - a single character, an Array of characters, or :EOF / nil,
#              for which nothing is queued.
def unget(characters)
  return if characters == :EOF || characters.nil?

  # String#to_a no longer exists from Ruby 1.9 on, so wrap single
  # characters explicitly instead of relying on it.
  items = characters.is_a?(Array) ? characters : [characters]
  @queue.unshift(*items)
end
|
||||
end
|
||||
|
||||
# String-like object with an associated position and various extra methods
|
||||
|
@ -433,14 +462,14 @@ module HTML5lib
|
|||
|
||||
if attr[0] == 'charset'
|
||||
tentative_encoding = attr[1]
|
||||
if HTML5lib.is_valid_encoding(tentative_encoding)
|
||||
if HTML5.is_valid_encoding(tentative_encoding)
|
||||
@encoding = tentative_encoding
|
||||
return false
|
||||
end
|
||||
elsif attr[0] == 'content'
|
||||
content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
|
||||
tentative_encoding = content_parser.parse
|
||||
if HTML5lib.is_valid_encoding(tentative_encoding)
|
||||
if HTML5.is_valid_encoding(tentative_encoding)
|
||||
@encoding = tentative_encoding
|
||||
return false
|
||||
end
|
|
@ -11,10 +11,10 @@
|
|||
#
|
||||
# @@TODO:
|
||||
# * Selectively lowercase only XHTML, but not foreign markup
|
||||
require 'html5lib/html5parser'
|
||||
require 'html5lib/constants'
|
||||
require 'html5/html5parser'
|
||||
require 'html5/constants'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
|
||||
# liberal XML parser
|
||||
class XMLParser < HTMLParser
|
||||
|
@ -25,25 +25,35 @@ module HTML5lib
|
|||
end
|
||||
|
||||
def normalizeToken(token)
|
||||
if token[:type] == :StartTag or token[:type] == :EmptyTag
|
||||
case token[:type]
|
||||
when :StartTag, :EmptyTag
|
||||
# We need to remove the duplicate attributes and convert attributes
|
||||
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
||||
# to a Hash so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
||||
|
||||
token[:data] = Hash[*token[:data].reverse.flatten]
|
||||
|
||||
# For EmptyTags, process both a Start and an End tag
|
||||
if token[:type] == :EmptyTag
|
||||
save = @tokenizer.contentModelFlag
|
||||
@phase.processStartTag(token[:name], token[:data])
|
||||
@tokenizer.contentModelFlag = save
|
||||
token[:data] = {}
|
||||
token[:type] = :EndTag
|
||||
end
|
||||
|
||||
elsif token[:type] == :EndTag
|
||||
when :Characters
|
||||
# un-escape RCDATA_ELEMENTS (e.g. style, script)
|
||||
if @tokenizer.contentModelFlag == :CDATA
|
||||
token[:data] = token[:data].
|
||||
gsub('<','<').gsub('>','>').gsub('&','&')
|
||||
end
|
||||
|
||||
when :EndTag
|
||||
if token[:data]
|
||||
parseError(_("End tag contains unexpected attributes."))
|
||||
end
|
||||
|
||||
elsif token[:type] == :Comment
|
||||
when :Comment
|
||||
# Rescue CDATA from the comments
|
||||
if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
|
||||
token[:type] = :Characters
|
|
@ -1,6 +1,7 @@
|
|||
require 'cgi'
|
||||
require 'html5/tokenizer'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
|
||||
# This module provides sanitization of XHTML+MathML+SVG
|
||||
# and of inline style attributes.
|
2
vendor/plugins/HTML5lib/lib/html5/serializer.rb
vendored
Normal file
2
vendor/plugins/HTML5lib/lib/html5/serializer.rb
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
require 'html5/serializer/htmlserializer'
|
||||
require 'html5/serializer/xhtmlserializer'
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5/constants'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
|
||||
class HTMLSerializer
|
||||
|
||||
|
@ -21,6 +21,7 @@ module HTML5lib
|
|||
@use_trailing_solidus = false
|
||||
@space_before_trailing_solidus = true
|
||||
@escape_lt_in_attrs = false
|
||||
@escape_rcdata = false
|
||||
|
||||
@omit_optional_tags = true
|
||||
@sanitize = false
|
||||
|
@ -43,22 +44,22 @@ module HTML5lib
|
|||
@errors = []
|
||||
|
||||
if encoding and @inject_meta_charset
|
||||
require 'html5lib/filters/inject_meta_charset'
|
||||
require 'html5/filters/inject_meta_charset'
|
||||
treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
|
||||
end
|
||||
|
||||
if @strip_whitespace
|
||||
require 'html5lib/filters/whitespace'
|
||||
require 'html5/filters/whitespace'
|
||||
treewalker = Filters::WhitespaceFilter.new(treewalker)
|
||||
end
|
||||
|
||||
if @sanitize
|
||||
require 'html5lib/filters/sanitizer'
|
||||
require 'html5/filters/sanitizer'
|
||||
treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
|
||||
end
|
||||
|
||||
if @omit_optional_tags
|
||||
require 'html5lib/filters/optionaltags'
|
||||
require 'html5/filters/optionaltags'
|
||||
treewalker = Filters::OptionalTagFilter.new(treewalker)
|
||||
end
|
||||
|
||||
|
@ -81,7 +82,7 @@ module HTML5lib
|
|||
|
||||
elsif [:StartTag, :EmptyTag].include? type
|
||||
name = token[:name]
|
||||
if RCDATA_ELEMENTS.include?(name)
|
||||
if RCDATA_ELEMENTS.include?(name) and not @escape_rcdata
|
||||
in_cdata = true
|
||||
elsif in_cdata
|
||||
serializeError(_("Unexpected child element of a CDATA element"))
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/serializer/htmlserializer'
|
||||
require 'html5/serializer/htmlserializer'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
|
||||
class XHTMLSerializer < HTMLSerializer
|
||||
DEFAULTS = {
|
||||
|
@ -8,7 +8,8 @@ module HTML5lib
|
|||
:minimize_boolean_attributes => false,
|
||||
:use_trailing_solidus => true,
|
||||
:escape_lt_in_attrs => true,
|
||||
:omit_optional_tags => false
|
||||
:omit_optional_tags => false,
|
||||
:escape_rcdata => true
|
||||
}
|
||||
|
||||
def initialize(options={})
|
|
@ -1,7 +1,7 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5lib/inputstream'
|
||||
require 'html5/constants'
|
||||
require 'html5/inputstream'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
|
||||
# This class takes care of tokenizing HTML.
|
||||
#
|
||||
|
@ -84,9 +84,9 @@ module HTML5lib
|
|||
# Start processing. When EOF is reached @state will return false
|
||||
# instead of true and the loop will terminate.
|
||||
while send @state
|
||||
while not @tokenQueue.empty?
|
||||
yield @tokenQueue.shift
|
||||
end
|
||||
yield :type => :ParseError, :data => @stream.errors.shift until
|
||||
@stream.errors.empty?
|
||||
yield @tokenQueue.shift until @tokenQueue.empty?
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -109,7 +109,7 @@ module HTML5lib
|
|||
|
||||
# The character we just consumed need to be put back on the stack so it
|
||||
# doesn't get lost...
|
||||
@stream.queue.push(data)
|
||||
@stream.unget(data)
|
||||
end
|
||||
|
||||
# This function returns either U+FFFD or the character based on the
|
||||
|
@ -128,7 +128,6 @@ module HTML5lib
|
|||
radix = 16
|
||||
end
|
||||
|
||||
char = [0xFFFD].pack('U')
|
||||
charStack = []
|
||||
|
||||
# Consume all the characters that are in range while making sure we
|
||||
|
@ -142,17 +141,25 @@ module HTML5lib
|
|||
# Convert the set of characters consumed to an int.
|
||||
charAsInt = charStack.join('').to_i(radix)
|
||||
|
||||
# If the integer is between 127 and 160 (so 128 and bigger and 159 and
|
||||
# smaller) we need to do the "windows trick".
|
||||
if (127...160).include? charAsInt
|
||||
if charAsInt == 13
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Incorrect CR newline entity. Replaced with LF.")})
|
||||
charAsInt = 10
|
||||
elsif (128..159).include? charAsInt
|
||||
# If the integer is between 127 and 160 (so 128 and bigger and 159
|
||||
# and smaller) we need to do the "windows trick".
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Entity used with illegal number (windows-1252 reference).")})
|
||||
|
||||
charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
|
||||
end
|
||||
|
||||
if charAsInt > 0 and charAsInt <= 1114111
|
||||
if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343)
|
||||
char = [charAsInt].pack('U')
|
||||
else
|
||||
char = [0xFFFD].pack('U')
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity represents an illegal codepoint.")})
|
||||
end
|
||||
|
||||
# Discard the ; if present. Otherwise, put it back on the queue and
|
||||
|
@ -160,18 +167,18 @@ module HTML5lib
|
|||
if c != ";"
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity didn't end with ';'.")})
|
||||
@stream.queue.push(c)
|
||||
@stream.unget(c)
|
||||
end
|
||||
|
||||
return char
|
||||
end
|
||||
|
||||
def consumeEntity
|
||||
def consumeEntity(from_attribute=false)
|
||||
char = nil
|
||||
charStack = [@stream.char]
|
||||
if SPACE_CHARACTERS.include?(charStack[0]) or
|
||||
[:EOF, '<', '&'].include?(charStack[0])
|
||||
@stream.queue+= charStack
|
||||
@stream.unget(charStack)
|
||||
elsif charStack[0] == "#"
|
||||
# We might have a number entity here.
|
||||
charStack += [@stream.char, @stream.char]
|
||||
|
@ -179,22 +186,22 @@ module HTML5lib
|
|||
# If we reach the end of the file put everything up to :EOF
|
||||
# back in the queue
|
||||
charStack = charStack[0...charStack.index(:EOF)]
|
||||
@stream.queue+= charStack
|
||||
@stream.unget(charStack)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity expected. Got end of file instead.")})
|
||||
else
|
||||
if charStack[1].downcase == "x" \
|
||||
and HEX_DIGITS.include? charStack[2]
|
||||
# Hexadecimal entity detected.
|
||||
@stream.queue.push(charStack[2])
|
||||
@stream.unget(charStack[2])
|
||||
char = consumeNumberEntity(true)
|
||||
elsif DIGITS.include? charStack[1]
|
||||
# Decimal entity detected.
|
||||
@stream.queue += charStack[1..-1]
|
||||
@stream.unget(charStack[1..-1])
|
||||
char = consumeNumberEntity(false)
|
||||
else
|
||||
# No number entity detected.
|
||||
@stream.queue += charStack
|
||||
@stream.unget(charStack)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity expected but none found.")})
|
||||
end
|
||||
|
@ -209,6 +216,8 @@ module HTML5lib
|
|||
filteredEntityList.reject! {|e| e[0].chr != charStack[0]}
|
||||
entityName = nil
|
||||
|
||||
# Try to find the longest entity the string will match to take care
|
||||
# of ¬i for instance.
|
||||
while charStack[-1] != :EOF
|
||||
name = charStack.join('')
|
||||
if filteredEntityList.any? {|e| e[0...name.length] == name}
|
||||
|
@ -220,6 +229,7 @@ module HTML5lib
|
|||
|
||||
if ENTITIES.include? name
|
||||
entityName = name
|
||||
break if entityName[-1] == ';'
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -228,15 +238,23 @@ module HTML5lib
|
|||
|
||||
# Check whether or not the last character returned can be
|
||||
# discarded or needs to be put back.
|
||||
if not charStack[-1] == ";"
|
||||
if entityName[-1] != ?;
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Named entity didn't end with ';'.")})
|
||||
@stream.queue += charStack[entityName.length..-1]
|
||||
end
|
||||
|
||||
if charStack[-1] != ";" and from_attribute and
|
||||
(ASCII_LETTERS.include?(charStack[entityName.length]) or
|
||||
DIGITS.include?(charStack[entityName.length]))
|
||||
@stream.unget(charStack)
|
||||
char = '&'
|
||||
else
|
||||
@stream.unget(charStack[entityName.length..-1])
|
||||
end
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Named entity expected. Got none.")})
|
||||
@stream.queue += charStack
|
||||
@stream.unget(charStack)
|
||||
end
|
||||
end
|
||||
return char
|
||||
|
@ -244,7 +262,7 @@ module HTML5lib
|
|||
|
||||
# This method replaces the need for "entityInAttributeValueState".
|
||||
def processEntityInAttribute
|
||||
entity = consumeEntity
|
||||
entity = consumeEntity(true)
|
||||
if entity
|
||||
@currentToken[:data][-1][1] += entity
|
||||
else
|
||||
|
@ -274,20 +292,23 @@ module HTML5lib
|
|||
@lastFourChars.shift if @lastFourChars.length > 4
|
||||
end
|
||||
|
||||
if data == "&" and [:PCDATA,:RCDATA].include?(@contentModelFlag)
|
||||
@state = @states[:entityData]
|
||||
if data == "&" and !@escapeFlag and
|
||||
[:PCDATA,:RCDATA].include?(@contentModelFlag)
|
||||
@state = @states[:entityData]
|
||||
|
||||
elsif data == "-" and [:CDATA,:RCDATA].include?(@contentModelFlag) and
|
||||
@escapeFlag == false and @lastFourChars.join('') == "<!--"
|
||||
elsif data == "-" and !@escapeFlag and
|
||||
[:CDATA,:RCDATA].include?(@contentModelFlag) and
|
||||
@lastFourChars.join('') == "<!--"
|
||||
@escapeFlag = true
|
||||
@tokenQueue.push({:type => :Characters, :data => data})
|
||||
|
||||
elsif data == "<" and @escapeFlag == false and
|
||||
elsif data == "<" and !@escapeFlag and
|
||||
[:PCDATA,:CDATA,:RCDATA].include?(@contentModelFlag)
|
||||
@state = @states[:tagOpen]
|
||||
|
||||
elsif data == ">" and [:CDATA,:RCDATA].include?(@contentModelFlag) and
|
||||
@escapeFlag == true and @lastFourChars[1..-1].join('') == "-->"
|
||||
elsif data == ">" and @escapeFlag and
|
||||
[:CDATA,:RCDATA].include?(@contentModelFlag) and
|
||||
@lastFourChars[1..-1].join('') == "-->"
|
||||
@escapeFlag = false
|
||||
@tokenQueue.push({:type => :Characters, :data => data})
|
||||
|
||||
|
@ -345,14 +366,14 @@ module HTML5lib
|
|||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected tag name. Got '?' instead (HTML doesn't " +
|
||||
"support processing instructions).")})
|
||||
@stream.queue.push(data)
|
||||
@stream.unget(data)
|
||||
@state = @states[:bogusComment]
|
||||
else
|
||||
# XXX
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected tag name. Got something else instead")})
|
||||
@tokenQueue.push({:type => :Characters, :data => "<"})
|
||||
@stream.queue.push(data)
|
||||
@stream.unget(data)
|
||||
@state = @states[:data]
|
||||
end
|
||||
else
|
||||
|
@ -363,7 +384,7 @@ module HTML5lib
|
|||
@state = @states[:closeTagOpen]
|
||||
else
|
||||
@tokenQueue.push({:type => :Characters, :data => "<"})
|
||||
@stream.queue.insert(0, data)
|
||||
@stream.unget(data)
|
||||
@state = @states[:data]
|
||||
end
|
||||
end
|
||||
|
@ -388,7 +409,7 @@ module HTML5lib
|
|||
|
||||
# Since this is just for checking. We put the characters back on
|
||||
# the stack.
|
||||
@stream.queue += charStack
|
||||
@stream.unget(charStack)
|
||||
end
|
||||
|
||||
if @currentToken and
|
||||
|
@ -426,7 +447,7 @@ module HTML5lib
|
|||
# XXX data can be _'_...
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected closing tag. Unexpected character '#{data}' found.")})
|
||||
@stream.queue.push(data)
|
||||
@stream.unget(data)
|
||||
@state = @states[:bogusComment]
|
||||
end
|
||||
|
||||
|
@ -556,7 +577,7 @@ module HTML5lib
|
|||
@state = @states[:attributeValueDoubleQuoted]
|
||||
elsif data == "&"
|
||||
@state = @states[:attributeValueUnQuoted]
|
||||
@stream.queue.push(data);
|
||||
@stream.unget(data);
|
||||
elsif data == "'"
|
||||
@state = @states[:attributeValueSingleQuoted]
|
||||
elsif data == ">"
|
||||
|
@ -656,7 +677,7 @@ module HTML5lib
|
|||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected '--' or 'DOCTYPE'. Not found.")})
|
||||
@stream.queue += charStack
|
||||
@stream.unget(charStack)
|
||||
@state = @states[:bogusComment]
|
||||
end
|
||||
end
|
||||
|
@ -771,7 +792,7 @@ module HTML5lib
|
|||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("No space after literal string 'DOCTYPE'.")})
|
||||
@stream.queue.push(data)
|
||||
@stream.unget(data)
|
||||
@state = @states[:beforeDoctypeName]
|
||||
end
|
||||
return true
|
||||
|
@ -827,7 +848,7 @@ module HTML5lib
|
|||
@state = @states[:data]
|
||||
elsif data == :EOF
|
||||
@currentToken[:data] = true
|
||||
@stream.queue.push(data)
|
||||
@stream.unget(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
@currentToken[:correct] = false
|
||||
|
@ -842,7 +863,7 @@ module HTML5lib
|
|||
elsif token == "system"
|
||||
@state = @states[:beforeDoctypeSystemIdentifier]
|
||||
else
|
||||
@stream.queue += charStack
|
||||
@stream.unget(charStack)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected 'public' or 'system'. Got '#{charStack.join('')}'")})
|
||||
@state = @states[:bogusDoctype]
|
||||
|
@ -1028,7 +1049,7 @@ module HTML5lib
|
|||
@state = @states[:data]
|
||||
elsif data == :EOF
|
||||
# XXX EMIT
|
||||
@stream.queue.push(data)
|
||||
@stream.unget(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in bogus doctype.")})
|
||||
@currentToken[:correct] = false
|
|
@ -1,17 +1,17 @@
|
|||
module HTML5lib
|
||||
module HTML5
|
||||
module TreeBuilders
|
||||
|
||||
class << self
|
||||
def [](name)
|
||||
case name.to_s.downcase
|
||||
when 'simpletree' then
|
||||
require 'html5lib/treebuilders/simpletree'
|
||||
require 'html5/treebuilders/simpletree'
|
||||
SimpleTree::TreeBuilder
|
||||
when 'rexml' then
|
||||
require 'html5lib/treebuilders/rexml'
|
||||
require 'html5/treebuilders/rexml'
|
||||
REXML::TreeBuilder
|
||||
when 'hpricot' then
|
||||
require 'html5lib/treebuilders/hpricot'
|
||||
require 'html5/treebuilders/hpricot'
|
||||
Hpricot::TreeBuilder
|
||||
else
|
||||
raise "Unknown TreeBuilder #{name}"
|
|
@ -1,8 +1,8 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5/constants'
|
||||
|
||||
#XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
|
||||
# The scope markers are inserted when entering buttons, object elements,
|
||||
# marquees, table cells, and table captions, and are used to prevent formatting
|
|
@ -1,221 +1,221 @@
|
|||
require 'html5lib/treebuilders/base'
|
||||
require 'rubygems'
|
||||
require 'hpricot'
|
||||
require 'forwardable'
|
||||
|
||||
module HTML5lib
|
||||
module TreeBuilders
|
||||
module Hpricot
|
||||
|
||||
class Node < Base::Node
|
||||
|
||||
extend Forwardable
|
||||
|
||||
def_delegators :@hpricot, :name
|
||||
|
||||
attr_accessor :hpricot
|
||||
|
||||
def initialize(name)
|
||||
super(name)
|
||||
@hpricot = self.class.hpricot_class.new name
|
||||
end
|
||||
|
||||
def appendChild(node)
|
||||
if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
|
||||
childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s
|
||||
else
|
||||
childNodes << node
|
||||
hpricot.children << node.hpricot
|
||||
end
|
||||
if (oldparent = node.hpricot.parent) != nil
|
||||
oldparent.children.delete_at(oldparent.children.index(node.hpricot))
|
||||
end
|
||||
node.hpricot.parent = hpricot
|
||||
node.parent = self
|
||||
end
|
||||
|
||||
def removeChild(node)
|
||||
childNodes.delete(node)
|
||||
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
|
||||
node.hpricot.parent = nil
|
||||
node.parent = nil
|
||||
end
|
||||
|
||||
def insertText(data, before=nil)
|
||||
if before
|
||||
insertBefore(TextNode.new(data), before)
|
||||
else
|
||||
appendChild(TextNode.new(data))
|
||||
end
|
||||
end
|
||||
|
||||
def insertBefore(node, refNode)
|
||||
index = childNodes.index(refNode)
|
||||
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
|
||||
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
|
||||
else
|
||||
refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
|
||||
childNodes.insert(index, node)
|
||||
end
|
||||
end
|
||||
|
||||
def hasContent
|
||||
childNodes.any?
|
||||
end
|
||||
end
|
||||
|
||||
class Element < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Elem
|
||||
end
|
||||
|
||||
def initialize(name)
|
||||
super(name)
|
||||
|
||||
@hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
|
||||
end
|
||||
|
||||
def name
|
||||
@hpricot.stag.name
|
||||
end
|
||||
|
||||
def cloneNode
|
||||
attributes.inject(self.class.new(name)) do |node, (name, value)|
|
||||
node.hpricot[name] = value
|
||||
node
|
||||
end
|
||||
end
|
||||
|
||||
# A call to Hpricot::Elem#raw_attributes is built dynamically,
|
||||
# so alterations to the returned value (a hash) will be lost.
|
||||
#
|
||||
# AttributeProxy works around this by forwarding :[]= calls
|
||||
# to the raw_attributes accessor on the element start tag.
|
||||
#
|
||||
class AttributeProxy
|
||||
def initialize(hpricot)
|
||||
@hpricot = hpricot
|
||||
end
|
||||
|
||||
def []=(k, v)
|
||||
@hpricot.stag.send(stag_attributes_method)[k] = v
|
||||
end
|
||||
|
||||
def stag_attributes_method
|
||||
# STag#attributes changed to STag#raw_attributes after Hpricot 0.5
|
||||
@hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
|
||||
end
|
||||
|
||||
def method_missing(*a, &b)
|
||||
@hpricot.attributes.send(*a, &b)
|
||||
end
|
||||
end
|
||||
|
||||
def attributes
|
||||
AttributeProxy.new(@hpricot)
|
||||
end
|
||||
|
||||
def attributes=(attrs)
|
||||
attrs.each { |name, value| @hpricot[name] = value }
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
tree = "\n|#{' ' * indent}<#{name}>"
|
||||
indent += 2
|
||||
attributes.each do |name, value|
|
||||
next if name == 'xmlns'
|
||||
tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
|
||||
end
|
||||
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
|
||||
end
|
||||
end
|
||||
|
||||
class Document < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Doc
|
||||
end
|
||||
|
||||
def initialize
|
||||
super(nil)
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentType < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::DocType
|
||||
end
|
||||
|
||||
def initialize(name)
|
||||
begin
|
||||
super(name)
|
||||
rescue ArgumentError # needs 3...
|
||||
end
|
||||
|
||||
@hpricot = ::Hpricot::DocType.new(name, nil, nil)
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentFragment < Element
|
||||
def initialize
|
||||
super('')
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
|
||||
end
|
||||
end
|
||||
|
||||
class TextNode < Node
|
||||
def initialize(data)
|
||||
@hpricot = ::Hpricot::Text.new(data)
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
"\n|#{' ' * indent}\"#{hpricot.content}\""
|
||||
end
|
||||
end
|
||||
|
||||
class CommentNode < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Comment
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
"\n|#{' ' * indent}<!-- #{hpricot.content} -->"
|
||||
end
|
||||
end
|
||||
|
||||
class TreeBuilder < Base::TreeBuilder
|
||||
def initialize
|
||||
@documentClass = Document
|
||||
@doctypeClass = DocumentType
|
||||
@elementClass = Element
|
||||
@commentClass = CommentNode
|
||||
@fragmentClass = DocumentFragment
|
||||
end
|
||||
|
||||
def testSerializer(node)
|
||||
node.printTree
|
||||
end
|
||||
|
||||
def getDocument
|
||||
@document.hpricot
|
||||
end
|
||||
|
||||
def getFragment
|
||||
@document = super
|
||||
return @document.hpricot.children
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
||||
require 'html5/treebuilders/base'
|
||||
require 'rubygems'
|
||||
require 'hpricot'
|
||||
require 'forwardable'
|
||||
|
||||
module HTML5
|
||||
module TreeBuilders
|
||||
module Hpricot
|
||||
|
||||
class Node < Base::Node
|
||||
|
||||
extend Forwardable
|
||||
|
||||
def_delegators :@hpricot, :name
|
||||
|
||||
attr_accessor :hpricot
|
||||
|
||||
def initialize(name)
|
||||
super(name)
|
||||
@hpricot = self.class.hpricot_class.new name
|
||||
end
|
||||
|
||||
def appendChild(node)
|
||||
if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
|
||||
childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s
|
||||
else
|
||||
childNodes << node
|
||||
hpricot.children << node.hpricot
|
||||
end
|
||||
if (oldparent = node.hpricot.parent) != nil
|
||||
oldparent.children.delete_at(oldparent.children.index(node.hpricot))
|
||||
end
|
||||
node.hpricot.parent = hpricot
|
||||
node.parent = self
|
||||
end
|
||||
|
||||
def removeChild(node)
|
||||
childNodes.delete(node)
|
||||
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
|
||||
node.hpricot.parent = nil
|
||||
node.parent = nil
|
||||
end
|
||||
|
||||
def insertText(data, before=nil)
|
||||
if before
|
||||
insertBefore(TextNode.new(data), before)
|
||||
else
|
||||
appendChild(TextNode.new(data))
|
||||
end
|
||||
end
|
||||
|
||||
def insertBefore(node, refNode)
|
||||
index = childNodes.index(refNode)
|
||||
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
|
||||
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
|
||||
else
|
||||
refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
|
||||
childNodes.insert(index, node)
|
||||
end
|
||||
end
|
||||
|
||||
def hasContent
|
||||
childNodes.any?
|
||||
end
|
||||
end
|
||||
|
||||
class Element < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Elem
|
||||
end
|
||||
|
||||
def initialize(name)
|
||||
super(name)
|
||||
|
||||
@hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
|
||||
end
|
||||
|
||||
def name
|
||||
@hpricot.stag.name
|
||||
end
|
||||
|
||||
def cloneNode
|
||||
attributes.inject(self.class.new(name)) do |node, (name, value)|
|
||||
node.hpricot[name] = value
|
||||
node
|
||||
end
|
||||
end
|
||||
|
||||
# A call to Hpricot::Elem#raw_attributes is built dynamically,
|
||||
# so alterations to the returned value (a hash) will be lost.
|
||||
#
|
||||
# AttributeProxy works around this by forwarding :[]= calls
|
||||
# to the raw_attributes accessor on the element start tag.
|
||||
#
|
||||
class AttributeProxy
|
||||
def initialize(hpricot)
|
||||
@hpricot = hpricot
|
||||
end
|
||||
|
||||
def []=(k, v)
|
||||
@hpricot.stag.send(stag_attributes_method)[k] = v
|
||||
end
|
||||
|
||||
def stag_attributes_method
|
||||
# STag#attributes changed to STag#raw_attributes after Hpricot 0.5
|
||||
@hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
|
||||
end
|
||||
|
||||
def method_missing(*a, &b)
|
||||
@hpricot.attributes.send(*a, &b)
|
||||
end
|
||||
end
|
||||
|
||||
def attributes
|
||||
AttributeProxy.new(@hpricot)
|
||||
end
|
||||
|
||||
def attributes=(attrs)
|
||||
attrs.each { |name, value| @hpricot[name] = value }
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
tree = "\n|#{' ' * indent}<#{name}>"
|
||||
indent += 2
|
||||
attributes.each do |name, value|
|
||||
next if name == 'xmlns'
|
||||
tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
|
||||
end
|
||||
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
|
||||
end
|
||||
end
|
||||
|
||||
class Document < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Doc
|
||||
end
|
||||
|
||||
def initialize
|
||||
super(nil)
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentType < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::DocType
|
||||
end
|
||||
|
||||
def initialize(name)
|
||||
begin
|
||||
super(name)
|
||||
rescue ArgumentError # needs 3...
|
||||
end
|
||||
|
||||
@hpricot = ::Hpricot::DocType.new(name, nil, nil)
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentFragment < Element
|
||||
def initialize
|
||||
super('')
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
|
||||
end
|
||||
end
|
||||
|
||||
class TextNode < Node
|
||||
def initialize(data)
|
||||
@hpricot = ::Hpricot::Text.new(data)
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
"\n|#{' ' * indent}\"#{hpricot.content}\""
|
||||
end
|
||||
end
|
||||
|
||||
class CommentNode < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Comment
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
"\n|#{' ' * indent}<!-- #{hpricot.content} -->"
|
||||
end
|
||||
end
|
||||
|
||||
class TreeBuilder < Base::TreeBuilder
|
||||
def initialize
|
||||
@documentClass = Document
|
||||
@doctypeClass = DocumentType
|
||||
@elementClass = Element
|
||||
@commentClass = CommentNode
|
||||
@fragmentClass = DocumentFragment
|
||||
end
|
||||
|
||||
def testSerializer(node)
|
||||
node.printTree
|
||||
end
|
||||
|
||||
def getDocument
|
||||
@document.hpricot
|
||||
end
|
||||
|
||||
def getFragment
|
||||
@document = super
|
||||
return @document.hpricot.children
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
|
@ -1,8 +1,8 @@
|
|||
require 'html5lib/treebuilders/base'
|
||||
require 'html5/treebuilders/base'
|
||||
require 'rexml/document'
|
||||
require 'forwardable'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module TreeBuilders
|
||||
module REXML
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/treebuilders/base'
|
||||
require 'html5/treebuilders/base'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module TreeBuilders
|
||||
module SimpleTree
|
||||
|
|
@ -1,19 +1,19 @@
|
|||
require 'html5lib/treewalkers/base'
|
||||
require 'html5/treewalkers/base'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module TreeWalkers
|
||||
|
||||
class << self
|
||||
def [](name)
|
||||
case name.to_s.downcase
|
||||
when 'simpletree' then
|
||||
require 'html5lib/treewalkers/simpletree'
|
||||
require 'html5/treewalkers/simpletree'
|
||||
SimpleTree::TreeWalker
|
||||
when 'rexml' then
|
||||
require 'html5lib/treewalkers/rexml'
|
||||
require 'html5/treewalkers/rexml'
|
||||
REXML::TreeWalker
|
||||
when 'hpricot' then
|
||||
require 'html5lib/treewalkers/hpricot'
|
||||
require 'html5/treewalkers/hpricot'
|
||||
Hpricot::TreeWalker
|
||||
else
|
||||
raise "Unknown TreeWalker #{name}"
|
|
@ -1,5 +1,5 @@
|
|||
require 'html5lib/constants'
|
||||
module HTML5lib
|
||||
require 'html5/constants'
|
||||
module HTML5
|
||||
module TreeWalkers
|
||||
|
||||
module TokenConstructor
|
|
@ -1,10 +1,10 @@
|
|||
require 'html5lib/treewalkers/base'
|
||||
require 'html5/treewalkers/base'
|
||||
require 'rexml/document'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module TreeWalkers
|
||||
module Hpricot
|
||||
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
|
||||
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
|
||||
|
||||
def node_details(node)
|
||||
case node
|
|
@ -1,10 +1,10 @@
|
|||
require 'html5lib/treewalkers/base'
|
||||
require 'html5/treewalkers/base'
|
||||
require 'rexml/document'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module TreeWalkers
|
||||
module REXML
|
||||
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
|
||||
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
|
||||
|
||||
def node_details(node)
|
||||
case node
|
|
@ -1,10 +1,10 @@
|
|||
require 'html5lib/treewalkers/base'
|
||||
require 'html5/treewalkers/base'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module TreeWalkers
|
||||
module SimpleTree
|
||||
class TreeWalker < HTML5lib::TreeWalkers::Base
|
||||
include HTML5lib::TreeBuilders::SimpleTree
|
||||
class TreeWalker < HTML5::TreeWalkers::Base
|
||||
include HTML5::TreeBuilders::SimpleTree
|
||||
|
||||
def walk(node)
|
||||
case node
|
708
vendor/plugins/HTML5lib/lib/html5lib/constants.rb
vendored
708
vendor/plugins/HTML5lib/lib/html5lib/constants.rb
vendored
|
@ -1,708 +0,0 @@
|
|||
module HTML5lib
|
||||
|
||||
class EOF < Exception; end
|
||||
|
||||
CONTENT_MODEL_FLAGS = [
|
||||
:PCDATA,
|
||||
:RCDATA,
|
||||
:CDATA,
|
||||
:PLAINTEXT
|
||||
]
|
||||
|
||||
SCOPING_ELEMENTS = %w[
|
||||
button
|
||||
caption
|
||||
html
|
||||
marquee
|
||||
object
|
||||
table
|
||||
td
|
||||
th
|
||||
]
|
||||
|
||||
FORMATTING_ELEMENTS = %w[
|
||||
a
|
||||
b
|
||||
big
|
||||
em
|
||||
font
|
||||
i
|
||||
nobr
|
||||
s
|
||||
small
|
||||
strike
|
||||
strong
|
||||
tt
|
||||
u
|
||||
]
|
||||
|
||||
SPECIAL_ELEMENTS = %w[
|
||||
address
|
||||
area
|
||||
base
|
||||
basefont
|
||||
bgsound
|
||||
blockquote
|
||||
body
|
||||
br
|
||||
center
|
||||
col
|
||||
colgroup
|
||||
dd
|
||||
dir
|
||||
div
|
||||
dl
|
||||
dt
|
||||
embed
|
||||
fieldset
|
||||
form
|
||||
frame
|
||||
frameset
|
||||
h1
|
||||
h2
|
||||
h3
|
||||
h4
|
||||
h5
|
||||
h6
|
||||
head
|
||||
hr
|
||||
iframe
|
||||
image
|
||||
img
|
||||
input
|
||||
isindex
|
||||
li
|
||||
link
|
||||
listing
|
||||
menu
|
||||
meta
|
||||
noembed
|
||||
noframes
|
||||
noscript
|
||||
ol
|
||||
optgroup
|
||||
option
|
||||
p
|
||||
param
|
||||
plaintext
|
||||
pre
|
||||
script
|
||||
select
|
||||
spacer
|
||||
style
|
||||
tbody
|
||||
textarea
|
||||
tfoot
|
||||
thead
|
||||
title
|
||||
tr
|
||||
ul
|
||||
wbr
|
||||
]
|
||||
|
||||
SPACE_CHARACTERS = %W[
|
||||
\t
|
||||
\n
|
||||
\x0B
|
||||
\x0C
|
||||
\x20
|
||||
\r
|
||||
]
|
||||
|
||||
TABLE_INSERT_MODE_ELEMENTS = %w[
|
||||
table
|
||||
tbody
|
||||
tfoot
|
||||
thead
|
||||
tr
|
||||
]
|
||||
|
||||
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
|
||||
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
|
||||
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
|
||||
DIGITS = '0'..'9'
|
||||
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
|
||||
|
||||
# Heading elements need to be ordered
|
||||
HEADING_ELEMENTS = %w[
|
||||
h1
|
||||
h2
|
||||
h3
|
||||
h4
|
||||
h5
|
||||
h6
|
||||
]
|
||||
|
||||
# XXX What about event-source and command?
|
||||
VOID_ELEMENTS = %w[
|
||||
base
|
||||
link
|
||||
meta
|
||||
hr
|
||||
br
|
||||
img
|
||||
embed
|
||||
param
|
||||
area
|
||||
col
|
||||
input
|
||||
]
|
||||
|
||||
CDATA_ELEMENTS = %w[title textarea]
|
||||
|
||||
RCDATA_ELEMENTS = %w[
|
||||
style
|
||||
script
|
||||
xmp
|
||||
iframe
|
||||
noembed
|
||||
noframes
|
||||
noscript
|
||||
]
|
||||
|
||||
BOOLEAN_ATTRIBUTES = {
|
||||
:global => %w[irrelevant],
|
||||
'style' => %w[scoped],
|
||||
'img' => %w[ismap],
|
||||
'audio' => %w[autoplay controls],
|
||||
'video' => %w[autoplay controls],
|
||||
'script' => %w[defer async],
|
||||
'details' => %w[open],
|
||||
'datagrid' => %w[multiple disabled],
|
||||
'command' => %w[hidden disabled checked default],
|
||||
'menu' => %w[autosubmit],
|
||||
'fieldset' => %w[disabled readonly],
|
||||
'option' => %w[disabled readonly selected],
|
||||
'optgroup' => %w[disabled readonly],
|
||||
'button' => %w[disabled autofocus],
|
||||
'input' => %w[disabled readonly required autofocus checked ismap],
|
||||
'select' => %w[disabled readonly autofocus multiple],
|
||||
'output' => %w[disabled readonly]
|
||||
}
|
||||
|
||||
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
|
||||
ENTITIES_WINDOWS1252 = [
|
||||
8364, # 0x80 0x20AC EURO SIGN
|
||||
65533, # 0x81 UNDEFINED
|
||||
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
|
||||
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
|
||||
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
|
||||
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
|
||||
8224, # 0x86 0x2020 DAGGER
|
||||
8225, # 0x87 0x2021 DOUBLE DAGGER
|
||||
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||
8240, # 0x89 0x2030 PER MILLE SIGN
|
||||
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
|
||||
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
|
||||
65533, # 0x8D UNDEFINED
|
||||
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
|
||||
65533, # 0x8F UNDEFINED
|
||||
65533, # 0x90 UNDEFINED
|
||||
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
|
||||
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
|
||||
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
|
||||
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
|
||||
8226, # 0x95 0x2022 BULLET
|
||||
8211, # 0x96 0x2013 EN DASH
|
||||
8212, # 0x97 0x2014 EM DASH
|
||||
732, # 0x98 0x02DC SMALL TILDE
|
||||
8482, # 0x99 0x2122 TRADE MARK SIGN
|
||||
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
|
||||
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
|
||||
65533, # 0x9D UNDEFINED
|
||||
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
|
||||
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||
]
|
||||
|
||||
private
|
||||
|
||||
def self.U n
|
||||
[n].pack('U')
|
||||
end
|
||||
|
||||
public
|
||||
|
||||
ENTITIES = {
|
||||
"AElig" => U(0xC6),
|
||||
"Aacute" => U(0xC1),
|
||||
"Acirc" => U(0xC2),
|
||||
"Agrave" => U(0xC0),
|
||||
"Alpha" => U(0x0391),
|
||||
"Aring" => U(0xC5),
|
||||
"Atilde" => U(0xC3),
|
||||
"Auml" => U(0xC4),
|
||||
"Beta" => U(0x0392),
|
||||
"Ccedil" => U(0xC7),
|
||||
"Chi" => U(0x03A7),
|
||||
"Dagger" => U(0x2021),
|
||||
"Delta" => U(0x0394),
|
||||
"ETH" => U(0xD0),
|
||||
"Eacute" => U(0xC9),
|
||||
"Ecirc" => U(0xCA),
|
||||
"Egrave" => U(0xC8),
|
||||
"Epsilon" => U(0x0395),
|
||||
"Eta" => U(0x0397),
|
||||
"Euml" => U(0xCB),
|
||||
"Gamma" => U(0x0393),
|
||||
"Iacute" => U(0xCD),
|
||||
"Icirc" => U(0xCE),
|
||||
"Igrave" => U(0xCC),
|
||||
"Iota" => U(0x0399),
|
||||
"Iuml" => U(0xCF),
|
||||
"Kappa" => U(0x039A),
|
||||
"Lambda" => U(0x039B),
|
||||
"Mu" => U(0x039C),
|
||||
"Ntilde" => U(0xD1),
|
||||
"Nu" => U(0x039D),
|
||||
"OElig" => U(0x0152),
|
||||
"Oacute" => U(0xD3),
|
||||
"Ocirc" => U(0xD4),
|
||||
"Ograve" => U(0xD2),
|
||||
"Omega" => U(0x03A9),
|
||||
"Omicron" => U(0x039F),
|
||||
"Oslash" => U(0xD8),
|
||||
"Otilde" => U(0xD5),
|
||||
"Ouml" => U(0xD6),
|
||||
"Phi" => U(0x03A6),
|
||||
"Pi" => U(0x03A0),
|
||||
"Prime" => U(0x2033),
|
||||
"Psi" => U(0x03A8),
|
||||
"Rho" => U(0x03A1),
|
||||
"Scaron" => U(0x0160),
|
||||
"Sigma" => U(0x03A3),
|
||||
"THORN" => U(0xDE),
|
||||
"Tau" => U(0x03A4),
|
||||
"Theta" => U(0x0398),
|
||||
"Uacute" => U(0xDA),
|
||||
"Ucirc" => U(0xDB),
|
||||
"Ugrave" => U(0xD9),
|
||||
"Upsilon" => U(0x03A5),
|
||||
"Uuml" => U(0xDC),
|
||||
"Xi" => U(0x039E),
|
||||
"Yacute" => U(0xDD),
|
||||
"Yuml" => U(0x0178),
|
||||
"Zeta" => U(0x0396),
|
||||
"aacute" => U(0xE1),
|
||||
"acirc" => U(0xE2),
|
||||
"acute" => U(0xB4),
|
||||
"aelig" => U(0xE6),
|
||||
"agrave" => U(0xE0),
|
||||
"alefsym" => U(0x2135),
|
||||
"alpha" => U(0x03B1),
|
||||
"amp" => U(0x26),
|
||||
"AMP" => U(0x26),
|
||||
"and" => U(0x2227),
|
||||
"ang" => U(0x2220),
|
||||
"apos" => U(0x27),
|
||||
"aring" => U(0xE5),
|
||||
"asymp" => U(0x2248),
|
||||
"atilde" => U(0xE3),
|
||||
"auml" => U(0xE4),
|
||||
"bdquo" => U(0x201E),
|
||||
"beta" => U(0x03B2),
|
||||
"brvbar" => U(0xA6),
|
||||
"bull" => U(0x2022),
|
||||
"cap" => U(0x2229),
|
||||
"ccedil" => U(0xE7),
|
||||
"cedil" => U(0xB8),
|
||||
"cent" => U(0xA2),
|
||||
"chi" => U(0x03C7),
|
||||
"circ" => U(0x02C6),
|
||||
"clubs" => U(0x2663),
|
||||
"cong" => U(0x2245),
|
||||
"copy" => U(0xA9),
|
||||
"COPY" => U(0xA9),
|
||||
"crarr" => U(0x21B5),
|
||||
"cup" => U(0x222A),
|
||||
"curren" => U(0xA4),
|
||||
"dArr" => U(0x21D3),
|
||||
"dagger" => U(0x2020),
|
||||
"darr" => U(0x2193),
|
||||
"deg" => U(0xB0),
|
||||
"delta" => U(0x03B4),
|
||||
"diams" => U(0x2666),
|
||||
"divide" => U(0xF7),
|
||||
"eacute" => U(0xE9),
|
||||
"ecirc" => U(0xEA),
|
||||
"egrave" => U(0xE8),
|
||||
"empty" => U(0x2205),
|
||||
"emsp" => U(0x2003),
|
||||
"ensp" => U(0x2002),
|
||||
"epsilon" => U(0x03B5),
|
||||
"equiv" => U(0x2261),
|
||||
"eta" => U(0x03B7),
|
||||
"eth" => U(0xF0),
|
||||
"euml" => U(0xEB),
|
||||
"euro" => U(0x20AC),
|
||||
"exist" => U(0x2203),
|
||||
"fnof" => U(0x0192),
|
||||
"forall" => U(0x2200),
|
||||
"frac12" => U(0xBD),
|
||||
"frac14" => U(0xBC),
|
||||
"frac34" => U(0xBE),
|
||||
"frasl" => U(0x2044),
|
||||
"gamma" => U(0x03B3),
|
||||
"ge" => U(0x2265),
|
||||
"gt" => U(0x3E),
|
||||
"GT" => U(0x3E),
|
||||
"hArr" => U(0x21D4),
|
||||
"harr" => U(0x2194),
|
||||
"hearts" => U(0x2665),
|
||||
"hellip" => U(0x2026),
|
||||
"iacute" => U(0xED),
|
||||
"icirc" => U(0xEE),
|
||||
"iexcl" => U(0xA1),
|
||||
"igrave" => U(0xEC),
|
||||
"image" => U(0x2111),
|
||||
"infin" => U(0x221E),
|
||||
"int" => U(0x222B),
|
||||
"iota" => U(0x03B9),
|
||||
"iquest" => U(0xBF),
|
||||
"isin" => U(0x2208),
|
||||
"iuml" => U(0xEF),
|
||||
"kappa" => U(0x03BA),
|
||||
"lArr" => U(0x21D0),
|
||||
"lambda" => U(0x03BB),
|
||||
"lang" => U(0x2329),
|
||||
"laquo" => U(0xAB),
|
||||
"larr" => U(0x2190),
|
||||
"lceil" => U(0x2308),
|
||||
"ldquo" => U(0x201C),
|
||||
"le" => U(0x2264),
|
||||
"lfloor" => U(0x230A),
|
||||
"lowast" => U(0x2217),
|
||||
"loz" => U(0x25CA),
|
||||
"lrm" => U(0x200E),
|
||||
"lsaquo" => U(0x2039),
|
||||
"lsquo" => U(0x2018),
|
||||
"lt" => U(0x3C),
|
||||
"LT" => U(0x3C),
|
||||
"macr" => U(0xAF),
|
||||
"mdash" => U(0x2014),
|
||||
"micro" => U(0xB5),
|
||||
"middot" => U(0xB7),
|
||||
"minus" => U(0x2212),
|
||||
"mu" => U(0x03BC),
|
||||
"nabla" => U(0x2207),
|
||||
"nbsp" => U(0xA0),
|
||||
"ndash" => U(0x2013),
|
||||
"ne" => U(0x2260),
|
||||
"ni" => U(0x220B),
|
||||
"not" => U(0xAC),
|
||||
"notin" => U(0x2209),
|
||||
"nsub" => U(0x2284),
|
||||
"ntilde" => U(0xF1),
|
||||
"nu" => U(0x03BD),
|
||||
"oacute" => U(0xF3),
|
||||
"ocirc" => U(0xF4),
|
||||
"oelig" => U(0x0153),
|
||||
"ograve" => U(0xF2),
|
||||
"oline" => U(0x203E),
|
||||
"omega" => U(0x03C9),
|
||||
"omicron" => U(0x03BF),
|
||||
"oplus" => U(0x2295),
|
||||
"or" => U(0x2228),
|
||||
"ordf" => U(0xAA),
|
||||
"ordm" => U(0xBA),
|
||||
"oslash" => U(0xF8),
|
||||
"otilde" => U(0xF5),
|
||||
"otimes" => U(0x2297),
|
||||
"ouml" => U(0xF6),
|
||||
"para" => U(0xB6),
|
||||
"part" => U(0x2202),
|
||||
"permil" => U(0x2030),
|
||||
"perp" => U(0x22A5),
|
||||
"phi" => U(0x03C6),
|
||||
"pi" => U(0x03C0),
|
||||
"piv" => U(0x03D6),
|
||||
"plusmn" => U(0xB1),
|
||||
"pound" => U(0xA3),
|
||||
"prime" => U(0x2032),
|
||||
"prod" => U(0x220F),
|
||||
"prop" => U(0x221D),
|
||||
"psi" => U(0x03C8),
|
||||
"quot" => U(0x22),
|
||||
"QUOT" => U(0x22),
|
||||
"rArr" => U(0x21D2),
|
||||
"radic" => U(0x221A),
|
||||
"rang" => U(0x232A),
|
||||
"raquo" => U(0xBB),
|
||||
"rarr" => U(0x2192),
|
||||
"rceil" => U(0x2309),
|
||||
"rdquo" => U(0x201D),
|
||||
"real" => U(0x211C),
|
||||
"reg" => U(0xAE),
|
||||
"REG" => U(0xAE),
|
||||
"rfloor" => U(0x230B),
|
||||
"rho" => U(0x03C1),
|
||||
"rlm" => U(0x200F),
|
||||
"rsaquo" => U(0x203A),
|
||||
"rsquo" => U(0x2019),
|
||||
"sbquo" => U(0x201A),
|
||||
"scaron" => U(0x0161),
|
||||
"sdot" => U(0x22C5),
|
||||
"sect" => U(0xA7),
|
||||
"shy" => U(0xAD),
|
||||
"sigma" => U(0x03C3),
|
||||
"sigmaf" => U(0x03C2),
|
||||
"sim" => U(0x223C),
|
||||
"spades" => U(0x2660),
|
||||
"sub" => U(0x2282),
|
||||
"sube" => U(0x2286),
|
||||
"sum" => U(0x2211),
|
||||
"sup" => U(0x2283),
|
||||
"sup1" => U(0xB9),
|
||||
"sup2" => U(0xB2),
|
||||
"sup3" => U(0xB3),
|
||||
"supe" => U(0x2287),
|
||||
"szlig" => U(0xDF),
|
||||
"tau" => U(0x03C4),
|
||||
"there4" => U(0x2234),
|
||||
"theta" => U(0x03B8),
|
||||
"thetasym" => U(0x03D1),
|
||||
"thinsp" => U(0x2009),
|
||||
"thorn" => U(0xFE),
|
||||
"tilde" => U(0x02DC),
|
||||
"times" => U(0xD7),
|
||||
"trade" => U(0x2122),
|
||||
"uArr" => U(0x21D1),
|
||||
"uacute" => U(0xFA),
|
||||
"uarr" => U(0x2191),
|
||||
"ucirc" => U(0xFB),
|
||||
"ugrave" => U(0xF9),
|
||||
"uml" => U(0xA8),
|
||||
"upsih" => U(0x03D2),
|
||||
"upsilon" => U(0x03C5),
|
||||
"uuml" => U(0xFC),
|
||||
"weierp" => U(0x2118),
|
||||
"xi" => U(0x03BE),
|
||||
"yacute" => U(0xFD),
|
||||
"yen" => U(0xA5),
|
||||
"yuml" => U(0xFF),
|
||||
"zeta" => U(0x03B6),
|
||||
"zwj" => U(0x200D),
|
||||
"zwnj" => U(0x200C)
|
||||
}
|
||||
|
||||
ENCODINGS = %w[
|
||||
ansi_x3.4-1968
|
||||
iso-ir-6
|
||||
ansi_x3.4-1986
|
||||
iso_646.irv:1991
|
||||
ascii
|
||||
iso646-us
|
||||
us-ascii
|
||||
us
|
||||
ibm367
|
||||
cp367
|
||||
csascii
|
||||
ks_c_5601-1987
|
||||
korean
|
||||
iso-2022-kr
|
||||
csiso2022kr
|
||||
euc-kr
|
||||
iso-2022-jp
|
||||
csiso2022jp
|
||||
iso-2022-jp-2
|
||||
iso-ir-58
|
||||
chinese
|
||||
csiso58gb231280
|
||||
iso_8859-1:1987
|
||||
iso-ir-100
|
||||
iso_8859-1
|
||||
iso-8859-1
|
||||
latin1
|
||||
l1
|
||||
ibm819
|
||||
cp819
|
||||
csisolatin1
|
||||
iso_8859-2:1987
|
||||
iso-ir-101
|
||||
iso_8859-2
|
||||
iso-8859-2
|
||||
latin2
|
||||
l2
|
||||
csisolatin2
|
||||
iso_8859-3:1988
|
||||
iso-ir-109
|
||||
iso_8859-3
|
||||
iso-8859-3
|
||||
latin3
|
||||
l3
|
||||
csisolatin3
|
||||
iso_8859-4:1988
|
||||
iso-ir-110
|
||||
iso_8859-4
|
||||
iso-8859-4
|
||||
latin4
|
||||
l4
|
||||
csisolatin4
|
||||
iso_8859-6:1987
|
||||
iso-ir-127
|
||||
iso_8859-6
|
||||
iso-8859-6
|
||||
ecma-114
|
||||
asmo-708
|
||||
arabic
|
||||
csisolatinarabic
|
||||
iso_8859-7:1987
|
||||
iso-ir-126
|
||||
iso_8859-7
|
||||
iso-8859-7
|
||||
elot_928
|
||||
ecma-118
|
||||
greek
|
||||
greek8
|
||||
csisolatingreek
|
||||
iso_8859-8:1988
|
||||
iso-ir-138
|
||||
iso_8859-8
|
||||
iso-8859-8
|
||||
hebrew
|
||||
csisolatinhebrew
|
||||
iso_8859-5:1988
|
||||
iso-ir-144
|
||||
iso_8859-5
|
||||
iso-8859-5
|
||||
cyrillic
|
||||
csisolatincyrillic
|
||||
iso_8859-9:1989
|
||||
iso-ir-148
|
||||
iso_8859-9
|
||||
iso-8859-9
|
||||
latin5
|
||||
l5
|
||||
csisolatin5
|
||||
iso-8859-10
|
||||
iso-ir-157
|
||||
l6
|
||||
iso_8859-10:1992
|
||||
csisolatin6
|
||||
latin6
|
||||
hp-roman8
|
||||
roman8
|
||||
r8
|
||||
ibm037
|
||||
cp037
|
||||
csibm037
|
||||
ibm424
|
||||
cp424
|
||||
csibm424
|
||||
ibm437
|
||||
cp437
|
||||
437
|
||||
cspc8codepage437
|
||||
ibm500
|
||||
cp500
|
||||
csibm500
|
||||
ibm775
|
||||
cp775
|
||||
cspc775baltic
|
||||
ibm850
|
||||
cp850
|
||||
850
|
||||
cspc850multilingual
|
||||
ibm852
|
||||
cp852
|
||||
852
|
||||
cspcp852
|
||||
ibm855
|
||||
cp855
|
||||
855
|
||||
csibm855
|
||||
ibm857
|
||||
cp857
|
||||
857
|
||||
csibm857
|
||||
ibm860
|
||||
cp860
|
||||
860
|
||||
csibm860
|
||||
ibm861
|
||||
cp861
|
||||
861
|
||||
cp-is
|
||||
csibm861
|
||||
ibm862
|
||||
cp862
|
||||
862
|
||||
cspc862latinhebrew
|
||||
ibm863
|
||||
cp863
|
||||
863
|
||||
csibm863
|
||||
ibm864
|
||||
cp864
|
||||
csibm864
|
||||
ibm865
|
||||
cp865
|
||||
865
|
||||
csibm865
|
||||
ibm866
|
||||
cp866
|
||||
866
|
||||
csibm866
|
||||
ibm869
|
||||
cp869
|
||||
869
|
||||
cp-gr
|
||||
csibm869
|
||||
ibm1026
|
||||
cp1026
|
||||
csibm1026
|
||||
koi8-r
|
||||
cskoi8r
|
||||
koi8-u
|
||||
big5-hkscs
|
||||
ptcp154
|
||||
csptcp154
|
||||
pt154
|
||||
cp154
|
||||
utf-7
|
||||
utf-16be
|
||||
utf-16le
|
||||
utf-16
|
||||
utf-8
|
||||
iso-8859-13
|
||||
iso-8859-14
|
||||
iso-ir-199
|
||||
iso_8859-14:1998
|
||||
iso_8859-14
|
||||
latin8
|
||||
iso-celtic
|
||||
l8
|
||||
iso-8859-15
|
||||
iso_8859-15
|
||||
iso-8859-16
|
||||
iso-ir-226
|
||||
iso_8859-16:2001
|
||||
iso_8859-16
|
||||
latin10
|
||||
l10
|
||||
gbk
|
||||
cp936
|
||||
ms936
|
||||
gb18030
|
||||
shift_jis
|
||||
ms_kanji
|
||||
csshiftjis
|
||||
euc-jp
|
||||
gb2312
|
||||
big5
|
||||
csbig5
|
||||
windows-1250
|
||||
windows-1251
|
||||
windows-1252
|
||||
windows-1253
|
||||
windows-1254
|
||||
windows-1255
|
||||
windows-1256
|
||||
windows-1257
|
||||
windows-1258
|
||||
tis-620
|
||||
hz-gb-2312
|
||||
]
|
||||
|
||||
end
|
|
@ -1 +0,0 @@
|
|||
require 'html5lib/filters/optionaltags'
|
|
@ -1,2 +0,0 @@
|
|||
require 'html5lib/serializer/htmlserializer'
|
||||
require 'html5lib/serializer/xhtmlserializer'
|
24
vendor/plugins/HTML5lib/parse.rb
vendored
24
vendor/plugins/HTML5lib/parse.rb
vendored
|
@ -26,15 +26,15 @@ def parse(opts, args)
|
|||
exit(1)
|
||||
end
|
||||
|
||||
require 'html5lib/treebuilders'
|
||||
treebuilder = HTML5lib::TreeBuilders[opts.treebuilder]
|
||||
require 'html5/treebuilders'
|
||||
treebuilder = HTML5::TreeBuilders[opts.treebuilder]
|
||||
|
||||
if opts.output == :xml
|
||||
require 'html5lib/liberalxmlparser'
|
||||
p = HTML5lib::XHTMLParser.new(:tree=>treebuilder)
|
||||
require 'html5/liberalxmlparser'
|
||||
p = HTML5::XHTMLParser.new(:tree=>treebuilder)
|
||||
else
|
||||
require 'html5lib/html5parser'
|
||||
p = HTML5lib::HTMLParser.new(:tree=>treebuilder)
|
||||
require 'html5/html5parser'
|
||||
p = HTML5::HTMLParser.new(:tree=>treebuilder)
|
||||
end
|
||||
|
||||
if opts.parsemethod == :parse
|
||||
|
@ -70,10 +70,10 @@ def printOutput(parser, document, opts)
|
|||
when :xml
|
||||
print document
|
||||
when :html
|
||||
require 'html5lib/treewalkers'
|
||||
tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document)
|
||||
require 'html5lib/serializer'
|
||||
puts HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer)
|
||||
require 'html5/treewalkers'
|
||||
tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
|
||||
require 'html5/serializer'
|
||||
puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
|
||||
when :hilite
|
||||
print document.hilite
|
||||
when :tree
|
||||
|
@ -188,6 +188,10 @@ opts = OptionParser.new do |opts|
|
|||
options.serializer[:escape_lt_in_attrs] = lt
|
||||
end
|
||||
|
||||
opts.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata|
|
||||
options.serializer[:escape_rcdata] = rcdata
|
||||
end
|
||||
|
||||
opts.separator ""
|
||||
opts.separator "Other Options:"
|
||||
|
||||
|
|
|
@ -33,7 +33,6 @@ EUC-jp
|
|||
#encoding
|
||||
EUC-jp
|
||||
|
||||
|
||||
#data
|
||||
<!-- -->
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
|
|
|
@ -92,7 +92,8 @@
|
|||
|
||||
{"description": "rcdata",
|
||||
"input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]],
|
||||
"expected": ["<script>a<b>c&d"]
|
||||
"expected": ["<script>a<b>c&d"],
|
||||
"xhtml": ["<script>a<b>c&d"]
|
||||
},
|
||||
|
||||
{"description": "doctype",
|
||||
|
|
|
@ -49,6 +49,12 @@
|
|||
"options": {"escape_lt_in_attrs": true},
|
||||
"input": [["StartTag", "a", {"title": "a<b>c&d"}]],
|
||||
"expected": ["<a title=\"a<b>c&d\">"]
|
||||
},
|
||||
|
||||
{"description": "rcdata",
|
||||
"options": {"escape_rcdata": true},
|
||||
"input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]],
|
||||
"expected": ["<script>a<b>c&d"]
|
||||
}
|
||||
|
||||
]}
|
||||
|
|
|
@ -135,7 +135,7 @@
|
|||
|
||||
{"description":"Entity without trailing semicolon (2)",
|
||||
"input":"I'm ¬in",
|
||||
"output":[["Character","I'm "], "ParseError", ["Character", "∉"]]},
|
||||
"output":[["Character","I'm "], "ParseError", ["Character", "¬in"]]},
|
||||
|
||||
{"description":"Partial entity match at end of file",
|
||||
"input":"I'm &no",
|
||||
|
@ -151,6 +151,18 @@
|
|||
|
||||
{"description":"Hexadecimal entity in attribute",
|
||||
"input":"<h a='?'></h>",
|
||||
"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]}
|
||||
"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]},
|
||||
|
||||
{"description":"Entity in attribute without semicolon ending in x",
|
||||
"input":"<h a='¬x'>",
|
||||
"output":["ParseError", ["StartTag", "h", {"a":"¬x"}]]},
|
||||
|
||||
{"description":"Entity in attribute without semicolon ending in 1",
|
||||
"input":"<h a='¬1'>",
|
||||
"output":["ParseError", ["StartTag", "h", {"a":"¬1"}]]},
|
||||
|
||||
{"description":"Entity in attribute without semicolon",
|
||||
"input":"<h a='©'>",
|
||||
"output":["ParseError", ["StartTag", "h", {"a":"©"}]]}
|
||||
|
||||
]}
|
||||
|
|
|
@ -42,19 +42,23 @@
|
|||
|
||||
{"description":"Numeric entity representing the NUL character",
|
||||
"input":"�",
|
||||
"output":[["Character", "\uFFFD"]]},
|
||||
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"Hexadecimal entity representing the NUL character",
|
||||
"input":"�",
|
||||
"output":[["Character", "\uFFFD"]]},
|
||||
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
|
||||
"input":"�",
|
||||
"output":[["Character", "\uFFFD"]]},
|
||||
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
|
||||
"input":"�",
|
||||
"output":[["Character", "\uFFFD"]]},
|
||||
"output":["ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"Hexadecimal entity pair representing a surrogate pair",
|
||||
"input":"��",
|
||||
"output":["ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"]]},
|
||||
|
||||
{"description":"Numeric entity representing a Windows-1252 'codepoint'",
|
||||
"input":"‰",
|
||||
|
@ -118,7 +122,7 @@
|
|||
|
||||
{"description":"Null Byte Replacement",
|
||||
"input":"\u0000",
|
||||
"output":[["Character", "\ufffd"]]}
|
||||
"output":["ParseError", ["Character", "\ufffd"]]}
|
||||
|
||||
]}
|
||||
|
||||
|
|
|
@ -285,6 +285,7 @@ Line1<br>Line2<br>Line3<br>Line4
|
|||
| <div>
|
||||
| <b>
|
||||
| <marquee>
|
||||
| <p>
|
||||
| "X"
|
||||
|
||||
#data
|
||||
|
@ -330,6 +331,7 @@ Unexpected end of file
|
|||
| <body>
|
||||
| <p>
|
||||
| <hr>
|
||||
| <p>
|
||||
|
||||
#data
|
||||
<select><b><option><select><option></b></select>X
|
||||
|
@ -1369,13 +1371,14 @@ unexpected EOF
|
|||
<head></p><meta><p>
|
||||
#errors
|
||||
6: missing document type declaration
|
||||
10: unexpected p element end tag in head
|
||||
10: unexpected p element end tag
|
||||
#document
|
||||
| <html>
|
||||
| <head>
|
||||
| <meta>
|
||||
| <body>
|
||||
| <p>
|
||||
| <meta>
|
||||
| <p>
|
||||
|
||||
#data
|
||||
<head></html><meta><p>
|
||||
|
@ -1485,6 +1488,7 @@ unexpected EOF
|
|||
| <div>
|
||||
| <b>
|
||||
| <marquee>
|
||||
| <p>
|
||||
|
||||
#data
|
||||
<script></script></div><title></title><p><p>
|
||||
|
@ -1511,6 +1515,7 @@ unexpected EOF
|
|||
| <body>
|
||||
| <p>
|
||||
| <hr>
|
||||
| <p>
|
||||
|
||||
#data
|
||||
<select><b><option><select><option></b></select>
|
||||
|
@ -1807,6 +1812,7 @@ Unexpected EOF
|
|||
| <head>
|
||||
| <body>
|
||||
| <br>
|
||||
| <p>
|
||||
|
||||
#data
|
||||
<table><tr></strong></b></em></i></u></strike></s></blink></tt></pre></big></small></font></select></h1></h2></h3></h4></h5></h6></body></br></a></img></title></span></style></script></table></th></td></tr></frame></area></link></param></hr></input></col></base></meta></basefont></bgsound></embed></spacer></p></dd></dt></caption></colgroup></tbody></tfoot></thead></address></blockquote></center></dir></div></dl></fieldset></listing></menu></ol></ul></li></nobr></wbr></form></button></marquee></object></html></frameset></head></iframe></image></isindex></noembed></noframes></noscript></optgroup></option></plaintext></textarea>
|
||||
|
@ -1928,3 +1934,4 @@ Unexpected EOF
|
|||
| <table>
|
||||
| <tbody>
|
||||
| <tr>
|
||||
| <p>
|
||||
|
|
|
@ -777,3 +777,4 @@ Unexpected </p> end tag.
|
|||
| <tbody>
|
||||
| <tr>
|
||||
| <td>
|
||||
| <p>
|
||||
|
|
|
@ -61,7 +61,6 @@ No DOCTYPE
|
|||
|
||||
#data
|
||||
<!DOCTYPE htML><html><head></head><body><pre>
|
||||
|
||||
foo</pre></body></html>
|
||||
#errors
|
||||
#document
|
||||
|
@ -72,10 +71,22 @@ foo</pre></body></html>
|
|||
| <pre>
|
||||
| "foo"
|
||||
|
||||
|
||||
#data
|
||||
<!DOCTYPE htML><html><head></head><body><pre>
|
||||
|
||||
foo</pre></body></html>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE htML>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <pre>
|
||||
| "
|
||||
foo"
|
||||
|
||||
#data
|
||||
<!DOCTYPE htML><html><head></head><body><pre>
|
||||
foo
|
||||
</pre></body></html>
|
||||
#errors
|
||||
|
@ -183,7 +194,6 @@ y</pre></body></html>
|
|||
|
||||
#data
|
||||
<!DOCTYPE htML><textarea>
|
||||
|
||||
foo</textarea>
|
||||
#errors
|
||||
#document
|
||||
|
@ -194,6 +204,20 @@ foo</textarea>
|
|||
| <textarea>
|
||||
| "foo"
|
||||
|
||||
#data
|
||||
<!DOCTYPE htML><textarea>
|
||||
|
||||
foo</textarea>
|
||||
#errors
|
||||
#document
|
||||
| <!DOCTYPE htML>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <textarea>
|
||||
| "
|
||||
foo"
|
||||
|
||||
#data
|
||||
<!DOCTYPE htML><html><head></head><body><ul><li><div><p><li></ul></body></html>
|
||||
#errors
|
||||
|
|
|
@ -1,37 +1,49 @@
|
|||
#data
|
||||
direct div content
|
||||
#errors
|
||||
#document-fragment div
|
||||
#document-fragment
|
||||
div
|
||||
#document
|
||||
| "direct div content"
|
||||
|
||||
#data
|
||||
direct textarea content
|
||||
#errors
|
||||
#document-fragment textarea
|
||||
#document-fragment
|
||||
textarea
|
||||
#document
|
||||
| "direct textarea content"
|
||||
|
||||
#data
|
||||
textarea content with <em>pseudo</em> <foo>markup
|
||||
#errors
|
||||
#document-fragment textarea
|
||||
#document-fragment
|
||||
textarea
|
||||
#document
|
||||
| "textarea content with <em>pseudo</em> <foo>markup"
|
||||
|
||||
#data
|
||||
this is CDATA inside a <style> element
|
||||
#errors
|
||||
#document-fragment style
|
||||
#document-fragment
|
||||
style
|
||||
#document
|
||||
| "this is CDATA inside a <style> element"
|
||||
|
||||
#data
|
||||
</plaintext>
|
||||
#errors
|
||||
#document-fragment plaintext
|
||||
#document-fragment
|
||||
plaintext
|
||||
#document
|
||||
| "</plaintext>"
|
||||
|
||||
#data
|
||||
setting html's innerHTML
|
||||
#errors
|
||||
#document-fragment html
|
||||
#document-fragment
|
||||
html
|
||||
#document
|
||||
| <head>
|
||||
| <body>
|
||||
| "setting html's innerHTML"
|
||||
|
@ -39,6 +51,8 @@ setting html's innerHTML
|
|||
#data
|
||||
<title>setting head's innerHTML</title>
|
||||
#errors
|
||||
#document-fragment head
|
||||
#document-fragment
|
||||
head
|
||||
#document
|
||||
| <title>
|
||||
| "setting head's innerHTML"
|
||||
|
|
|
@ -27,3 +27,41 @@
|
|||
| <head>
|
||||
| <body>
|
||||
| <meta>
|
||||
|
||||
#data
|
||||
<!doctype HTml><form><div></form><div>
|
||||
#errors
|
||||
Form end tag ignored.
|
||||
Unexpected end of file.
|
||||
#document
|
||||
| <!DOCTYPE HTml>
|
||||
| <html>
|
||||
| <head>
|
||||
| <body>
|
||||
| <form>
|
||||
| <div>
|
||||
| <div>
|
||||
|
||||
#data
|
||||
<!doctype HTml><title>&</title>
|
||||
#errors
|
||||
Unexpected end of file.
|
||||
#document
|
||||
| <!DOCTYPE HTml>
|
||||
| <html>
|
||||
| <head>
|
||||
| <title>
|
||||
| "&"
|
||||
| <body>
|
||||
|
||||
#data
|
||||
<!doctype HTml><title><!--&--></title>
|
||||
#errors
|
||||
Unexpected end of file.
|
||||
#document
|
||||
| <!DOCTYPE HTml>
|
||||
| <html>
|
||||
| <head>
|
||||
| <title>
|
||||
| "<!--&-->"
|
||||
| <body>
|
||||
|
|
162
vendor/plugins/HTML5lib/tests/preamble.rb
vendored
162
vendor/plugins/HTML5lib/tests/preamble.rb
vendored
|
@ -1,81 +1,81 @@
|
|||
require 'test/unit'
|
||||
|
||||
# Directory reached by stepping up three times from this file's absolute path
# (presumably the root of the checkout -- confirm against the repo layout).
HTML5LIB_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))

# Prefer a 'testdata' directory under HTML5LIB_BASE; otherwise fall back to a
# 'testdata' directory next to this file's parent directory.
# File.exist? is used because File.exists? was deprecated and then removed
# from Ruby (3.2).
if File.exist?(File.join(HTML5LIB_BASE, 'testdata'))
  TESTDATA_DIR = File.join(HTML5LIB_BASE, 'testdata')
else
  TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata')
end

# Put ../lib and this file's own directory at the front of the load path so
# that later `require` calls resolve against them first.
$:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')

$:.unshift File.dirname(__FILE__)
|
||||
|
||||
# Lists the fixture files found in one subdirectory of TESTDATA_DIR.
#
# subdirectory - directory name, relative to TESTDATA_DIR.
#
# Returns an Array of paths matching "*.*" in that directory. The list is
# sorted so that callers iterating over it see the files in the same order on
# every platform (Dir[] did not guarantee any order before Ruby 3.0).
def html5lib_test_files(subdirectory)
  Dir[File.join(TESTDATA_DIR, subdirectory, '*.*')].sort
end
|
||||
|
||||
# Use the json library when it can be loaded; otherwise define a minimal
# stand-in JSON.parse.
begin
  require 'rubygems'
  require 'json'
rescue LoadError
  class JSON
    # Converts JSON text to Ruby data by rewriting it into Ruby literal syntax
    # and evaluating the result.
    #
    # json - String of JSON text. It is not modified.
    #
    # Returns the evaluated Ruby value.
    #
    # SECURITY NOTE(review): the text is passed to eval, so this fallback must
    # only ever be given trusted files, never external input.
    def self.parse json
      # Rewrite into a copy: the in-place gsub! used previously altered the
      # caller's string and raised on frozen strings.
      ruby = json.gsub(/"\s*:/, '"=>')
      ruby = ruby.gsub(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
      null = nil # lets a bare JSON `null` evaluate to nil inside eval
      eval ruby
    end
  end
end
|
||||
|
||||
module HTML5lib
  # Helpers shared by the test cases: test-case parsing and normalisation of
  # tree dumps before comparison.
  module TestSupport
    # Returns true when +b+ begins with +a+. Note the argument order: the
    # prefix comes first, the string being examined second.
    def self.startswith?(a, b)
      b[0... a.length] == a
    end

    # Splits the text of one test case into its sections.
    #
    # data - String holding the input lines followed by "#errors",
    #        optionally "#document-fragment <name>", and "#document" sections.
    #
    # Returns [innerHTML, input, output, errors]: innerHTML is the text after
    # the "#document-fragment" heading (nil when absent), input and output are
    # newline-joined Strings, errors is an Array of lines.
    #
    # Raises ArgumentError when a "#document-fragment" heading has nothing
    # after it.
    #
    # Empty lines are dropped from every section.
    def self.parseTestcase(data)
      innerHTML = nil
      input = []
      output = []
      errors = []
      currentList = input
      data.split(/\n/).each do |line|
        if line == "#errors"
          currentList = errors
        elsif startswith?("#document-fragment", line)
          # Skip the 18-character heading plus one separator character.
          innerHTML = line[19..-1]
          # The previous code raised an undefined AssertionError constant here.
          raise ArgumentError, "#document-fragment heading without an element name" unless innerHTML
          currentList = output
        elsif line == "#document"
          currentList = output
        elsif line.empty? || startswith?("#errors", line) ||
              startswith?("#document", line) || startswith?("#data", line)
          # Blank lines and unrecognised heading-like lines are ignored.
          next
        elsif currentList.equal?(output) && startswith?("|", line)
          # Identity check, not ==: while both lists are still empty they
          # compare equal by value, which used to strip input lines too.
          currentList.push(line[2..-1])
        else
          currentList.push(line)
        end
      end
      return innerHTML, input.join("\n"), output.join("\n"), errors
    end

    # convert the output of str(document) to the format used in the testcases
    #
    # Drops the first line of the dump and removes the first three characters
    # of every remaining line that starts with "|" and is longer than two
    # characters. An empty dump yields an empty string.
    def convertTreeDump(treedump)
      # [1..-1] on an empty array is nil, hence the fallback.
      body = treedump.split(/\n/)[1..-1] || []
      body.map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
    end

    # Sorts each run of consecutive, equally indented "name=value" lines so
    # that attribute order does not affect comparisons.
    def sortattrs(output)
      output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
        match.split("\n").sort.join("\n")
      end
    end

  end
end
|
||||
require 'test/unit'
|
||||
|
||||
# Directory reached by stepping up three times from this file's absolute path
# (presumably the root of the checkout -- confirm against the repo layout).
HTML5_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))

# Prefer a 'testdata' directory under HTML5_BASE; otherwise fall back to a
# 'testdata' directory next to this file's parent directory.
# File.exist? is used because File.exists? was deprecated and then removed
# from Ruby (3.2).
if File.exist?(File.join(HTML5_BASE, 'testdata'))
  TESTDATA_DIR = File.join(HTML5_BASE, 'testdata')
else
  TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata')
end

# Put ../lib and this file's own directory at the front of the load path so
# that later `require` calls resolve against them first.
$:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')

$:.unshift File.dirname(__FILE__)
|
||||
|
||||
# Lists the fixture files found in one subdirectory of TESTDATA_DIR.
#
# subdirectory - directory name, relative to TESTDATA_DIR.
#
# Returns an Array of paths matching "*.*" in that directory. The list is
# sorted so that callers iterating over it see the files in the same order on
# every platform (Dir[] did not guarantee any order before Ruby 3.0).
def html5_test_files(subdirectory)
  Dir[File.join(TESTDATA_DIR, subdirectory, '*.*')].sort
end
|
||||
|
||||
# Use the json library when it can be loaded; otherwise define a minimal
# stand-in JSON.parse.
begin
  require 'rubygems'
  require 'json'
rescue LoadError
  class JSON
    # Converts JSON text to Ruby data by rewriting it into Ruby literal syntax
    # and evaluating the result.
    #
    # json - String of JSON text. It is not modified.
    #
    # Returns the evaluated Ruby value.
    #
    # SECURITY NOTE(review): the text is passed to eval, so this fallback must
    # only ever be given trusted files, never external input.
    def self.parse json
      # Rewrite into a copy: the in-place gsub! used previously altered the
      # caller's string and raised on frozen strings.
      ruby = json.gsub(/"\s*:/, '"=>')
      ruby = ruby.gsub(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
      null = nil # lets a bare JSON `null` evaluate to nil inside eval
      eval ruby
    end
  end
end
|
||||
|
||||
module HTML5
  # Helpers shared by the test cases: normalisation of tree dumps and a
  # reader for sectioned test-data files.
  module TestSupport
    # convert the output of str(document) to the format used in the testcases
    #
    # Drops the first line of the dump and removes the first three characters
    # of every remaining line that starts with "|" and is longer than two
    # characters. An empty dump yields an empty string.
    def convertTreeDump(treedump)
      # [1..-1] on an empty array is nil, hence the fallback.
      body = treedump.split(/\n/)[1..-1] || []
      body.map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
    end

    # Sorts each run of consecutive, equally indented "name=value" lines so
    # that attribute order does not affect comparisons.
    def sortattrs(output)
      output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
        match.split("\n").sort.join("\n")
      end
    end

    # Reads a test-data file made of "#heading" lines followed by content and
    # enumerates its test cases.
    #
    # A new test case starts each time the first heading in +sections+ is
    # seen again. Lines before the first recognised heading are ignored, and
    # "#..." lines whose name is not in +sections+ are treated as content.
    class TestData
      include Enumerable

      # filename - path of the test-data file.
      # sections - Array of heading names without the leading "#"; the first
      #            one marks the start of a test case.
      #
      # The file is read in full and closed here, so no handle is left open
      # and the object can be enumerated more than once.
      def initialize(filename, sections)
        @lines = File.readlines(filename)
        @sections = sections
      end

      # Yields one Array per test case holding the text of each section, in
      # the order given by +sections+ (nil for a section the case lacks).
      # Yields nothing for a file without any recognised heading. Returns an
      # Enumerator when called without a block.
      def each
        return to_enum(:each) unless block_given?

        data = {}
        key = nil
        @lines.each do |line|
          # chomp rather than dropping the last character, so a heading on a
          # final line without a newline is still recognised.
          heading = line[0, 1] == '#' ? line.chomp[1..-1] : nil
          if heading && @sections.include?(heading)
            if !data.empty? && heading == @sections[0]
              # Drop the blank separator line before the next test case;
              # normaliseOutput then strips the section's own final newline.
              data[key].chomp!
              yield normaliseOutput(data)
              data = {}
            end
            key = heading
            data[key] = "".dup
          elsif key
            data[key] << line
          end
        end
        yield normaliseOutput(data) unless data.empty?
      end

      # Strips one trailing newline from every collected section and returns
      # the section texts ordered as in +sections+.
      def normaliseOutput(data)
        data.each_value { |text| text.chomp! }
        @sections.map { |heading| data[heading] }
      end
    end
  end
end
|
||||
|
|
16
vendor/plugins/HTML5lib/tests/test_encoding.rb
vendored
16
vendor/plugins/HTML5lib/tests/test_encoding.rb
vendored
|
@ -1,8 +1,10 @@
|
|||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/inputstream'
|
||||
require 'html5/inputstream'
|
||||
|
||||
class Html5EncodingTestCase < Test::Unit::TestCase
|
||||
include HTML5
|
||||
include TestSupport
|
||||
|
||||
begin
|
||||
require 'rubygems'
|
||||
|
@ -10,23 +12,21 @@ class Html5EncodingTestCase < Test::Unit::TestCase
|
|||
|
||||
def test_chardet
|
||||
file = File.open(File.join(TESTDATA_DIR, 'encoding', 'chardet', 'test_big5.txt'), 'r')
|
||||
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
|
||||
stream = HTML5::HTMLInputStream.new(file, :chardet => true)
|
||||
assert_equal 'big5', stream.char_encoding.downcase
|
||||
rescue LoadError
|
||||
puts "chardet not found, skipping chardet tests"
|
||||
end
|
||||
end
|
||||
|
||||
html5lib_test_files('encoding').each do |test_file|
|
||||
html5_test_files('encoding').each do |test_file|
|
||||
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
|
||||
|
||||
File.read(test_file).split("#data\n").each_with_index do |data, index|
|
||||
next if data.empty?
|
||||
input, encoding = data.split(/\n#encoding\s+/, 2)
|
||||
encoding = encoding.split[0]
|
||||
TestData.new(test_file, %w(data encoding)).
|
||||
each_with_index do |(input, encoding), index|
|
||||
|
||||
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
|
||||
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
|
||||
stream = HTML5::HTMLInputStream.new(input, :chardet => false)
|
||||
assert_equal encoding.downcase, stream.char_encoding.downcase, input
|
||||
end
|
||||
end
|
||||
|
|
75
vendor/plugins/HTML5lib/tests/test_lxp.rb
vendored
75
vendor/plugins/HTML5lib/tests/test_lxp.rb
vendored
|
@ -1,23 +1,23 @@
|
|||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/liberalxmlparser'
|
||||
require 'html5/liberalxmlparser'
|
||||
|
||||
XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/
|
||||
SORTATTRS = '<#{$1+$2.split.sort.join(' ')+$3}>'
|
||||
|
||||
def assert_xml_equal(input, expected=nil, parser=HTML5lib::XMLParser)
|
||||
def assert_xml_equal(input, expected=nil, parser=HTML5::XMLParser)
|
||||
sortattrs = proc {"<#{$1+$2.split.sort.join(' ')+$3}>"}
|
||||
document = parser.parse(input.chomp).root
|
||||
if not expected
|
||||
expected = input.chomp.gsub(XMLELEM,SORTATTRS)
|
||||
expected = input.chomp.gsub(XMLELEM,&sortattrs)
|
||||
expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')}
|
||||
output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,SORTATTRS)
|
||||
output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,&sortattrs)
|
||||
assert_equal(expected, output)
|
||||
else
|
||||
assert_equal(expected, document.to_s.gsub(/'/,'"'))
|
||||
end
|
||||
end
|
||||
|
||||
def assert_xhtml_equal(input, expected=nil, parser=HTML5lib::XHTMLParser)
|
||||
def assert_xhtml_equal(input, expected=nil, parser=HTML5::XHTMLParser)
|
||||
assert_xml_equal(input, expected, parser)
|
||||
end
|
||||
|
||||
|
@ -34,10 +34,10 @@ class BasicXhtml5Test < Test::Unit::TestCase
|
|||
|
||||
def test_title_body_named_charref
|
||||
assert_xhtml_equal(
|
||||
'<title>mdash</title>A &mdash B',
|
||||
'<title>ntilde</title>A ñ B',
|
||||
'<html xmlns="http://www.w3.org/1999/xhtml">' +
|
||||
'<head><title>mdash</title></head>' +
|
||||
'<body>A '+ [0x2014].pack('U') + ' B</body>' +
|
||||
'<head><title>ntilde</title></head>' +
|
||||
'<body>A '+ [0xF1].pack('U') + ' B</body>' +
|
||||
'</html>')
|
||||
end
|
||||
end
|
||||
|
@ -193,20 +193,71 @@ EOX
|
|||
def test_br
|
||||
assert_xhtml_equal <<EOX1
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>XLINK</title></head>
|
||||
<head><title>BR</title></head>
|
||||
<body>
|
||||
<br/>
|
||||
</body></html>
|
||||
EOX1
|
||||
end
|
||||
|
||||
def xtest_strong
|
||||
def test_strong
|
||||
assert_xhtml_equal <<EOX
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>XLINK</title></head>
|
||||
<head><title>STRONG</title></head>
|
||||
<body>
|
||||
<strong></strong>
|
||||
</body></html>
|
||||
EOX
|
||||
end
|
||||
|
||||
def test_script
|
||||
assert_xhtml_equal <<EOX
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>SCRIPT</title></head>
|
||||
<body>
|
||||
<script>1 < 2 & 3</script>
|
||||
</body></html>
|
||||
EOX
|
||||
end
|
||||
|
||||
def test_script_src
|
||||
assert_xhtml_equal <<EOX1, <<EOX2.strip
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>SCRIPT</title><script src="http://example.com"/></head>
|
||||
<body>
|
||||
<script>1 < 2 & 3</script>
|
||||
</body></html>
|
||||
EOX1
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>SCRIPT</title><script src="http://example.com"></script></head>
|
||||
<body>
|
||||
<script>1 < 2 & 3</script>
|
||||
</body></html>
|
||||
EOX2
|
||||
end
|
||||
|
||||
def test_title
|
||||
assert_xhtml_equal <<EOX
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>1 < 2 & 3</title></head>
|
||||
<body>
|
||||
</body></html>
|
||||
EOX
|
||||
end
|
||||
|
||||
def test_prolog
|
||||
assert_xhtml_equal <<EOX1, <<EOX2.strip
|
||||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>PROLOG</title></head>
|
||||
<body>
|
||||
</body></html>
|
||||
EOX1
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>PROLOG</title></head>
|
||||
<body>
|
||||
</body></html>
|
||||
EOX2
|
||||
end
|
||||
|
||||
end
|
||||
|
|
25
vendor/plugins/HTML5lib/tests/test_parser.rb
vendored
25
vendor/plugins/HTML5lib/tests/test_parser.rb
vendored
|
@ -1,7 +1,7 @@
|
|||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/treebuilders'
|
||||
require 'html5lib/html5parser'
|
||||
require 'html5/treebuilders'
|
||||
require 'html5/html5parser'
|
||||
|
||||
|
||||
$tree_types_to_test = ['simpletree', 'rexml']
|
||||
|
@ -18,18 +18,17 @@ puts 'Testing tree builders: ' + $tree_types_to_test * ', '
|
|||
|
||||
|
||||
class Html5ParserTestCase < Test::Unit::TestCase
|
||||
include HTML5lib
|
||||
include HTML5
|
||||
include TestSupport
|
||||
|
||||
html5lib_test_files('tree-construction').each do |test_file|
|
||||
html5_test_files('tree-construction').each do |test_file|
|
||||
|
||||
test_name = File.basename(test_file).sub('.dat', '')
|
||||
|
||||
File.read(test_file).split("#data\n").each_with_index do |data, index|
|
||||
next if data.empty?
|
||||
|
||||
innerHTML, input, expected_output, expected_errors =
|
||||
TestSupport.parseTestcase(data)
|
||||
TestData.new(test_file, %w(data errors document-fragment document)).
|
||||
each_with_index do |(input, errors, innerHTML, expected), index|
|
||||
|
||||
expected = expected.gsub("\n| ","\n")[2..-1]
|
||||
|
||||
$tree_types_to_test.each do |tree_name|
|
||||
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
|
||||
|
@ -44,9 +43,9 @@ class Html5ParserTestCase < Test::Unit::TestCase
|
|||
|
||||
actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
|
||||
|
||||
assert_equal sortattrs(expected_output), sortattrs(actual_output), [
|
||||
assert_equal sortattrs(expected), sortattrs(actual_output), [
|
||||
'', 'Input:', input,
|
||||
'', 'Expected:', expected_output,
|
||||
'', 'Expected:', expected,
|
||||
'', 'Recieved:', actual_output
|
||||
].join("\n")
|
||||
|
||||
|
@ -54,9 +53,9 @@ class Html5ParserTestCase < Test::Unit::TestCase
|
|||
actual_errors = parser.errors.map do |(line, col), message|
|
||||
'Line: %i Col: %i %s' % [line, col, message]
|
||||
end
|
||||
assert_equal expected_errors.length, parser.errors.length, [
|
||||
assert_equal errors.length, parser.errors.length, [
|
||||
'Input', input + "\n",
|
||||
'Expected errors:', expected_errors.join("\n"),
|
||||
'Expected errors:', errors.join("\n"),
|
||||
'Actual errors:', actual_errors.join("\n")
|
||||
].join("\n")
|
||||
end
|
||||
|
|
14
vendor/plugins/HTML5lib/tests/test_sanitizer.rb
vendored
14
vendor/plugins/HTML5lib/tests/test_sanitizer.rb
vendored
|
@ -2,14 +2,14 @@
|
|||
|
||||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/html5parser'
|
||||
require 'html5lib/liberalxmlparser'
|
||||
require 'html5lib/treewalkers'
|
||||
require 'html5lib/serializer'
|
||||
require 'html5lib/sanitizer'
|
||||
require 'html5/html5parser'
|
||||
require 'html5/liberalxmlparser'
|
||||
require 'html5/treewalkers'
|
||||
require 'html5/serializer'
|
||||
require 'html5/sanitizer'
|
||||
|
||||
class SanitizeTest < Test::Unit::TestCase
|
||||
include HTML5lib
|
||||
include HTML5
|
||||
|
||||
def sanitize_xhtml stream
|
||||
XHTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).to_s
|
||||
|
@ -131,7 +131,7 @@ class SanitizeTest < Test::Unit::TestCase
|
|||
# check_sanitization(input, output, output, output)
|
||||
# end
|
||||
|
||||
html5lib_test_files('sanitizer').each do |filename|
|
||||
html5_test_files('sanitizer').each do |filename|
|
||||
JSON::parse(open(filename).read).each do |test|
|
||||
define_method "test_#{test['name']}" do
|
||||
check_sanitization(
|
||||
|
|
14
vendor/plugins/HTML5lib/tests/test_serializer.rb
vendored
14
vendor/plugins/HTML5lib/tests/test_serializer.rb
vendored
|
@ -1,13 +1,13 @@
|
|||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/html5parser'
|
||||
require 'html5lib/serializer'
|
||||
require 'html5lib/treewalkers'
|
||||
require 'html5/html5parser'
|
||||
require 'html5/serializer'
|
||||
require 'html5/treewalkers'
|
||||
|
||||
#Run the serialize error checks
|
||||
checkSerializeErrors = false
|
||||
|
||||
class JsonWalker < HTML5lib::TreeWalkers::Base
|
||||
class JsonWalker < HTML5::TreeWalkers::Base
|
||||
def each
|
||||
@tree.each do |token|
|
||||
case token[0]
|
||||
|
@ -31,7 +31,7 @@ class JsonWalker < HTML5lib::TreeWalkers::Base
|
|||
end
|
||||
|
||||
class Html5SerializeTestcase < Test::Unit::TestCase
|
||||
html5lib_test_files('serializer').each do |filename|
|
||||
html5_test_files('serializer').each do |filename|
|
||||
test_name = File.basename(filename).sub('.test', '')
|
||||
tests = JSON::parse(open(filename).read)
|
||||
tests['tests'].each_with_index do |test, index|
|
||||
|
@ -41,7 +41,7 @@ class Html5SerializeTestcase < Test::Unit::TestCase
|
|||
test["options"][:encoding] = test["options"]["encoding"]
|
||||
end
|
||||
|
||||
result = HTML5lib::HTMLSerializer.
|
||||
result = HTML5::HTMLSerializer.
|
||||
serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
|
||||
expected = test["expected"]
|
||||
if expected.length == 1
|
||||
|
@ -52,7 +52,7 @@ class Html5SerializeTestcase < Test::Unit::TestCase
|
|||
|
||||
return if test_name == 'optionaltags'
|
||||
|
||||
result = HTML5lib::XHTMLSerializer.
|
||||
result = HTML5::XHTMLSerializer.
|
||||
serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
|
||||
expected = test["xhtml"] || test["expected"]
|
||||
if expected.length == 1
|
||||
|
|
4
vendor/plugins/HTML5lib/tests/test_stream.rb
vendored
4
vendor/plugins/HTML5lib/tests/test_stream.rb
vendored
|
@ -1,9 +1,9 @@
|
|||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/inputstream'
|
||||
require 'html5/inputstream'
|
||||
|
||||
class HTMLInputStreamTest < Test::Unit::TestCase
|
||||
include HTML5lib
|
||||
include HTML5
|
||||
|
||||
def test_char_ascii
|
||||
stream = HTMLInputStream.new("'", :encoding=>'ascii')
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/tokenizer'
|
||||
require 'html5/tokenizer'
|
||||
|
||||
require 'tokenizer_test_parser'
|
||||
|
||||
|
@ -36,7 +36,7 @@ class Html5TokenizerTestCase < Test::Unit::TestCase
|
|||
'' ] * "\n"
|
||||
|
||||
assert_nothing_raised message do
|
||||
tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])
|
||||
tokenizer = HTML5::HTMLTokenizer.new(data['input'])
|
||||
|
||||
tokenizer.contentModelFlag = content_model_flag.to_sym
|
||||
|
||||
|
@ -53,7 +53,7 @@ class Html5TokenizerTestCase < Test::Unit::TestCase
|
|||
end
|
||||
end
|
||||
|
||||
html5lib_test_files('tokenizer').each do |test_file|
|
||||
html5_test_files('tokenizer').each do |test_file|
|
||||
test_name = File.basename(test_file).sub('.test', '')
|
||||
|
||||
tests = JSON.parse(File.read(test_file))['tests']
|
||||
|
|
|
@ -1,25 +1,25 @@
|
|||
require File.join(File.dirname(__FILE__), 'preamble')
|
||||
|
||||
require 'html5lib/html5parser'
|
||||
require 'html5lib/treewalkers'
|
||||
require 'html5lib/treebuilders'
|
||||
require 'html5/html5parser'
|
||||
require 'html5/treewalkers'
|
||||
require 'html5/treebuilders'
|
||||
|
||||
$tree_types_to_test = {
|
||||
'simpletree' =>
|
||||
{:builder => HTML5lib::TreeBuilders['simpletree'],
|
||||
:walker => HTML5lib::TreeWalkers['simpletree']},
|
||||
{:builder => HTML5::TreeBuilders['simpletree'],
|
||||
:walker => HTML5::TreeWalkers['simpletree']},
|
||||
'rexml' =>
|
||||
{:builder => HTML5lib::TreeBuilders['rexml'],
|
||||
:walker => HTML5lib::TreeWalkers['rexml']},
|
||||
{:builder => HTML5::TreeBuilders['rexml'],
|
||||
:walker => HTML5::TreeWalkers['rexml']},
|
||||
'hpricot' =>
|
||||
{:builder => HTML5lib::TreeBuilders['hpricot'],
|
||||
:walker => HTML5lib::TreeWalkers['hpricot']},
|
||||
{:builder => HTML5::TreeBuilders['hpricot'],
|
||||
:walker => HTML5::TreeWalkers['hpricot']},
|
||||
}
|
||||
|
||||
puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', '
|
||||
|
||||
class TestTreeWalkers < Test::Unit::TestCase
|
||||
include HTML5lib::TestSupport
|
||||
include HTML5::TestSupport
|
||||
|
||||
def concatenateCharacterTokens(tokens)
|
||||
charactersToken = nil
|
||||
|
@ -70,22 +70,21 @@ class TestTreeWalkers < Test::Unit::TestCase
|
|||
return output.join("\n")
|
||||
end
|
||||
|
||||
html5lib_test_files('tree-construction').each do |test_file|
|
||||
html5_test_files('tree-construction').each do |test_file|
|
||||
|
||||
test_name = File.basename(test_file).sub('.dat', '')
|
||||
next if test_name == 'tests5' # TODO
|
||||
|
||||
File.read(test_file).split("#data\n").each_with_index do |data, index|
|
||||
next if data.empty?
|
||||
TestData.new(test_file, %w(data errors document-fragment document)).
|
||||
each_with_index do |(input, errors, innerHTML, expected), index|
|
||||
|
||||
innerHTML, input, expected_output, expected_errors =
|
||||
HTML5lib::TestSupport::parseTestcase(data)
|
||||
expected = expected.gsub("\n| ","\n")[2..-1]
|
||||
|
||||
$tree_types_to_test.each do |tree_name, tree_class|
|
||||
|
||||
define_method "test_#{test_name}_#{index}_#{tree_name}" do
|
||||
|
||||
parser = HTML5lib::HTMLParser.new(:tree => tree_class[:builder])
|
||||
parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])
|
||||
|
||||
if innerHTML
|
||||
parser.parseFragment(input, innerHTML)
|
||||
|
@ -97,7 +96,7 @@ class TestTreeWalkers < Test::Unit::TestCase
|
|||
|
||||
begin
|
||||
output = sortattrs(convertTokens(tree_class[:walker].new(document)))
|
||||
expected = sortattrs(expected_output)
|
||||
expected = sortattrs(expected)
|
||||
assert_equal expected, output, [
|
||||
'', 'Input:', input,
|
||||
'', 'Expected:', expected,
|
||||
|
|
|
@ -1,63 +1,63 @@
|
|||
require 'html5lib/constants'
|
||||
|
||||
# Runs a tokenizer to completion and records its tokens as plain arrays and
# "ParseError" strings.
class TokenizerTestParser
  # tokenizer - any object whose #each yields token Hashes with a :type key.
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Dispatches every token to the matching process<Type> method and returns
  # the collected output tokens.
  def parse
    @outputTokens = []

    @tokenizer.each do |token|
      send("process#{token[:type]}", token)
    end

    @outputTokens
  end

  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:publicId],
      token[:systemId], token[:correct]])
  end

  def processStartTag(token)
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # Recorded as a start tag, preceded by a ParseError when the element is
  # not listed in HTML5lib::VOID_ELEMENTS.
  def processEmptyTag(token)
    if not HTML5lib::VOID_ELEMENTS.include? token[:name]
      @outputTokens.push("ParseError")
    end
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An end tag whose :data is non-empty additionally records a ParseError.
  def processEndTag(token)
    if token[:data].length > 0
      self.processParseError(token)
    end
    @outputTokens.push(["EndTag", token[:name]])
  end

  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  # Space-only character tokens are recorded exactly like other characters.
  alias processSpaceCharacters processCharacters

  # End of input adds nothing to the output.
  def processEOF(token)
  end

  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end
|
||||
require 'html5/constants'
|
||||
|
||||
# Test helper that drains a tokenizer and flattens its token stream into the
# plain-array form compared against tokenizer test expectations.
#
# Every token is expected to be a Hash with a :type key; the token is
# dispatched to the instance method named "process" + type.
class TokenizerTestParser
  # tokenizer - an object responding to #each that yields token Hashes.
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Consumes the whole tokenizer, dispatching each token to its handler.
  #
  # Returns the Array of collected output tokens. The array is reset on
  # every call.
  def parse
    @outputTokens = []

    @tokenizer.each do |token|
      send("process#{token[:type]}", token)
    end

    @outputTokens
  end

  # Records a DOCTYPE entry: name, public id, system id and correctness flag.
  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:publicId],
                        token[:systemId], token[:correct]])
  end

  # Records a start tag with its name and attribute data.
  def processStartTag(token)
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # Records an empty (self-closing) tag as a StartTag entry. A "ParseError"
  # marker is emitted first when the element name is not listed in
  # HTML5::VOID_ELEMENTS.
  def processEmptyTag(token)
    unless HTML5::VOID_ELEMENTS.include?(token[:name])
      @outputTokens.push("ParseError")
    end
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # Records an end tag. An end tag carrying any data is reported as a parse
  # error before the EndTag entry itself.
  def processEndTag(token)
    processParseError(token) if token[:data].length > 0
    @outputTokens.push(["EndTag", token[:name]])
  end

  # Records a comment and its text.
  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  # Records character data. Defined once only; the original carried a second,
  # identical definition after the alias below.
  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  # Whitespace-only character tokens are recorded exactly like other text.
  alias processSpaceCharacters processCharacters

  # End of input produces no output entry.
  def processEOF(token)
  end

  # Records a parse error as the bare string "ParseError"; the token's
  # details are intentionally dropped.
  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end
|
||||
|
|
|
@ -28,6 +28,7 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
|
|||
|
||||
class LineSource
|
||||
include MaRuKu::Strings
|
||||
attr_reader :parent
|
||||
|
||||
def initialize(lines, parent=nil, parent_offset=nil)
|
||||
raise "NIL lines? " if not lines
|
||||
|
|
|
@ -65,22 +65,8 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
|
|||
when :ald
|
||||
output.push read_ald(src)
|
||||
when :text
|
||||
if src.cur_line =~ MightBeTableHeader and
|
||||
(src.next_line && src.next_line =~ TableSeparator)
|
||||
output.push read_table(src)
|
||||
elsif [:header1,:header2].include? src.next_line.md_type
|
||||
output.push read_header12(src)
|
||||
elsif eventually_comes_a_def_list(src)
|
||||
definition = read_definition(src)
|
||||
if output.last.kind_of?(MDElement) &&
|
||||
output.last.node_type == :definition_list then
|
||||
output.last.children << definition
|
||||
else
|
||||
output.push md_el(:definition_list, [definition])
|
||||
end
|
||||
else # Start of a paragraph
|
||||
output.push read_paragraph(src)
|
||||
end
|
||||
# paragraph, or table, or definition list
|
||||
read_text_material(src, output)
|
||||
when :header2, :hrule
|
||||
# hrule
|
||||
src.shift_line
|
||||
|
@ -102,7 +88,12 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
|
|||
when :raw_html; e = read_raw_html(src); output << e if e
|
||||
|
||||
when :footnote_text; output.push read_footnote_text(src)
|
||||
when :ref_definition; read_ref_definition(src, output)
|
||||
when :ref_definition;
|
||||
if src.parent && (src.cur_index == 0)
|
||||
read_text_material(src, output)
|
||||
else
|
||||
read_ref_definition(src, output)
|
||||
end
|
||||
when :abbreviation; output.push read_abbreviation(src)
|
||||
when :xml_instr; read_xml_instruction(src, output)
|
||||
when :metadata;
|
||||
|
@ -149,6 +140,24 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
|
|||
output
|
||||
end
|
||||
|
||||
# Reads the text material that begins at the current line of +src+ and
# records it in +output+. Which reader is used depends on the current and
# next lines:
#
# * current line matches MightBeTableHeader and the next line matches
#   TableSeparator -> read_table
# * next line has md_type :header1 or :header2 -> read_header12
# * eventually_comes_a_def_list(src) is true -> read_definition; the
#   definition is appended to the last output element when that element is
#   a :definition_list, otherwise a new :definition_list is pushed
# * anything else -> read_paragraph
def read_text_material(src, output)
  if src.cur_line =~ MightBeTableHeader &&
     src.next_line && src.next_line =~ TableSeparator
    output.push read_table(src)
  elsif [:header1, :header2].include?(src.next_line.md_type)
    output.push read_header12(src)
  elsif eventually_comes_a_def_list(src)
    definition = read_definition(src)
    previous = output.last
    if previous.kind_of?(MDElement) && previous.node_type == :definition_list
      # Consecutive definitions are merged into one list.
      previous.children << definition
    else
      output.push md_el(:definition_list, [definition])
    end
  else
    # Start of a paragraph
    output.push read_paragraph(src)
  end
end
|
||||
|
||||
|
||||
def read_ald(src)
|
||||
|
@ -274,9 +283,9 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
|
|||
item_type = src.cur_line.md_type
|
||||
first = src.shift_line
|
||||
|
||||
# Ugly things going on inside `read_indented_content`
|
||||
indentation = spaces_before_first_char(first)
|
||||
break_list = [:ulist, :olist, :ial]
|
||||
# Ugly things going on inside `read_indented_content`
|
||||
lines, want_my_paragraph =
|
||||
read_indented_content(src,indentation, break_list, item_type)
|
||||
|
||||
|
@ -285,7 +294,7 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
|
|||
stripped = first[indentation, first.size-1]
|
||||
lines.unshift stripped
|
||||
|
||||
#dbg_describe_ary(lines, 'LIST ITEM ')
|
||||
# dbg_describe_ary(lines, 'LIST ITEM ')
|
||||
|
||||
src2 = LineSource.new(lines, src, parent_offset)
|
||||
children = parse_blocks(src2)
|
||||
|
|
Loading…
Reference in a new issue