HTML5lib Sanitizer
Replaced native Sanitizer with HTML5lib version. Synced with latest Maruku.
Parent: 457ec8627c
Commit: 6b21ac484f
36 changed files with 6534 additions and 215 deletions

vendor/plugins/HTML5lib/lib/html5lib.rb (vendored, new normal file, 11 additions)
@@ -0,0 +1,11 @@

require 'html5lib/html5parser'

module HTML5lib
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  def self.parseFragment(stream, options={})
    HTMLParser.parse(stream, options)
  end
end
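
A minimal usage sketch of this entry point (the markup and variable names are illustrative, and the return value depends on html5parser.rb, whose diff is suppressed further down):

    require 'html5lib'

    # Both helpers hand the options hash straight to HTMLParser.parse;
    # note that, as committed, parseFragment also delegates to parse.
    doc  = HTML5lib.parse('<p>Hello <b>world</b></p>')
    frag = HTML5lib.parseFragment('<b>bold</b> text')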

vendor/plugins/HTML5lib/lib/html5lib/constants.rb (vendored, new executable file, 676 additions)
@@ -0,0 +1,676 @@

module HTML5lib

  class EOF < Exception; end

  CONTENT_MODEL_FLAGS = [:PCDATA, :RCDATA, :CDATA, :PLAINTEXT]

  SCOPING_ELEMENTS = %w[button caption html marquee object table td th]

  FORMATTING_ELEMENTS = %w[a b big em font i nobr s small strike strong tt u]

  SPECIAL_ELEMENTS = %w[address area base basefont bgsound blockquote body br center
    col colgroup dd dir div dl dt embed fieldset form frame frameset h1 h2 h3 h4 h5 h6
    head hr iframe image img input isindex li link listing menu meta noembed noframes
    noscript ol optgroup option p param plaintext pre script select spacer style tbody
    textarea tfoot thead title tr ul wbr]

  SPACE_CHARACTERS = %W[\t \n \x0B \x0C \x20 \r]

  TABLE_INSERT_MODE_ELEMENTS = %w[table tbody tfoot thead tr]

  ASCII_LOWERCASE = ('a'..'z').to_a.join('')
  ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
  ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
  DIGITS = '0'..'9'
  HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a

  # Heading elements need to be ordered
  HEADING_ELEMENTS = %w[h1 h2 h3 h4 h5 h6]

  # XXX What about event-source and command?
  VOID_ELEMENTS = %w[base link meta hr br img embed param area col input]

  # entitiesWindows1252 has to be _ordered_ and needs to have an index.
  ENTITIES_WINDOWS1252 = [
    8364,  # 0x80  0x20AC  EURO SIGN
    65533, # 0x81          UNDEFINED
    8218,  # 0x82  0x201A  SINGLE LOW-9 QUOTATION MARK
    402,   # 0x83  0x0192  LATIN SMALL LETTER F WITH HOOK
    8222,  # 0x84  0x201E  DOUBLE LOW-9 QUOTATION MARK
    8230,  # 0x85  0x2026  HORIZONTAL ELLIPSIS
    8224,  # 0x86  0x2020  DAGGER
    8225,  # 0x87  0x2021  DOUBLE DAGGER
    710,   # 0x88  0x02C6  MODIFIER LETTER CIRCUMFLEX ACCENT
    8240,  # 0x89  0x2030  PER MILLE SIGN
    352,   # 0x8A  0x0160  LATIN CAPITAL LETTER S WITH CARON
    8249,  # 0x8B  0x2039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    338,   # 0x8C  0x0152  LATIN CAPITAL LIGATURE OE
    65533, # 0x8D          UNDEFINED
    381,   # 0x8E  0x017D  LATIN CAPITAL LETTER Z WITH CARON
    65533, # 0x8F          UNDEFINED
    65533, # 0x90          UNDEFINED
    8216,  # 0x91  0x2018  LEFT SINGLE QUOTATION MARK
    8217,  # 0x92  0x2019  RIGHT SINGLE QUOTATION MARK
    8220,  # 0x93  0x201C  LEFT DOUBLE QUOTATION MARK
    8221,  # 0x94  0x201D  RIGHT DOUBLE QUOTATION MARK
    8226,  # 0x95  0x2022  BULLET
    8211,  # 0x96  0x2013  EN DASH
    8212,  # 0x97  0x2014  EM DASH
    732,   # 0x98  0x02DC  SMALL TILDE
    8482,  # 0x99  0x2122  TRADE MARK SIGN
    353,   # 0x9A  0x0161  LATIN SMALL LETTER S WITH CARON
    8250,  # 0x9B  0x203A  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    339,   # 0x9C  0x0153  LATIN SMALL LIGATURE OE
    65533, # 0x9D          UNDEFINED
    382,   # 0x9E  0x017E  LATIN SMALL LETTER Z WITH CARON
    376    # 0x9F  0x0178  LATIN CAPITAL LETTER Y WITH DIAERESIS
  ]

  private

  def self.U n
    [n].pack('U')
  end

  public

  ENTITIES = {
    "AElig" => U(0xC6), "Aacute" => U(0xC1), "Acirc" => U(0xC2), "Agrave" => U(0xC0), "Alpha" => U(0x0391),
    "Aring" => U(0xC5), "Atilde" => U(0xC3), "Auml" => U(0xC4), "Beta" => U(0x0392), "Ccedil" => U(0xC7),
    "Chi" => U(0x03A7), "Dagger" => U(0x2021), "Delta" => U(0x0394), "ETH" => U(0xD0), "Eacute" => U(0xC9),
    "Ecirc" => U(0xCA), "Egrave" => U(0xC8), "Epsilon" => U(0x0395), "Eta" => U(0x0397), "Euml" => U(0xCB),
    "Gamma" => U(0x0393), "Iacute" => U(0xCD), "Icirc" => U(0xCE), "Igrave" => U(0xCC), "Iota" => U(0x0399),
    "Iuml" => U(0xCF), "Kappa" => U(0x039A), "Lambda" => U(0x039B), "Mu" => U(0x039C), "Ntilde" => U(0xD1),
    "Nu" => U(0x039D), "OElig" => U(0x0152), "Oacute" => U(0xD3), "Ocirc" => U(0xD4), "Ograve" => U(0xD2),
    "Omega" => U(0x03A9), "Omicron" => U(0x039F), "Oslash" => U(0xD8), "Otilde" => U(0xD5), "Ouml" => U(0xD6),
    "Phi" => U(0x03A6), "Pi" => U(0x03A0), "Prime" => U(0x2033), "Psi" => U(0x03A8), "Rho" => U(0x03A1),
    "Scaron" => U(0x0160), "Sigma" => U(0x03A3), "THORN" => U(0xDE), "Tau" => U(0x03A4), "Theta" => U(0x0398),
    "Uacute" => U(0xDA), "Ucirc" => U(0xDB), "Ugrave" => U(0xD9), "Upsilon" => U(0x03A5), "Uuml" => U(0xDC),
    "Xi" => U(0x039E), "Yacute" => U(0xDD), "Yuml" => U(0x0178), "Zeta" => U(0x0396), "aacute" => U(0xE1),
    "acirc" => U(0xE2), "acute" => U(0xB4), "aelig" => U(0xE6), "agrave" => U(0xE0), "alefsym" => U(0x2135),
    "alpha" => U(0x03B1), "amp" => U(0x26), "AMP" => U(0x26), "and" => U(0x2227), "ang" => U(0x2220),
    "apos" => U(0x27), "aring" => U(0xE5), "asymp" => U(0x2248), "atilde" => U(0xE3), "auml" => U(0xE4),
    "bdquo" => U(0x201E), "beta" => U(0x03B2), "brvbar" => U(0xA6), "bull" => U(0x2022), "cap" => U(0x2229),
    "ccedil" => U(0xE7), "cedil" => U(0xB8), "cent" => U(0xA2), "chi" => U(0x03C7), "circ" => U(0x02C6),
    "clubs" => U(0x2663), "cong" => U(0x2245), "copy" => U(0xA9), "COPY" => U(0xA9), "crarr" => U(0x21B5),
    "cup" => U(0x222A), "curren" => U(0xA4), "dArr" => U(0x21D3), "dagger" => U(0x2020), "darr" => U(0x2193),
    "deg" => U(0xB0), "delta" => U(0x03B4), "diams" => U(0x2666), "divide" => U(0xF7), "eacute" => U(0xE9),
    "ecirc" => U(0xEA), "egrave" => U(0xE8), "empty" => U(0x2205), "emsp" => U(0x2003), "ensp" => U(0x2002),
    "epsilon" => U(0x03B5), "equiv" => U(0x2261), "eta" => U(0x03B7), "eth" => U(0xF0), "euml" => U(0xEB),
    "euro" => U(0x20AC), "exist" => U(0x2203), "fnof" => U(0x0192), "forall" => U(0x2200), "frac12" => U(0xBD),
    "frac14" => U(0xBC), "frac34" => U(0xBE), "frasl" => U(0x2044), "gamma" => U(0x03B3), "ge" => U(0x2265),
    "gt" => U(0x3E), "GT" => U(0x3E), "hArr" => U(0x21D4), "harr" => U(0x2194), "hearts" => U(0x2665),
    "hellip" => U(0x2026), "iacute" => U(0xED), "icirc" => U(0xEE), "iexcl" => U(0xA1), "igrave" => U(0xEC),
    "image" => U(0x2111), "infin" => U(0x221E), "int" => U(0x222B), "iota" => U(0x03B9), "iquest" => U(0xBF),
    "isin" => U(0x2208), "iuml" => U(0xEF), "kappa" => U(0x03BA), "lArr" => U(0x21D0), "lambda" => U(0x03BB),
    "lang" => U(0x2329), "laquo" => U(0xAB), "larr" => U(0x2190), "lceil" => U(0x2308), "ldquo" => U(0x201C),
    "le" => U(0x2264), "lfloor" => U(0x230A), "lowast" => U(0x2217), "loz" => U(0x25CA), "lrm" => U(0x200E),
    "lsaquo" => U(0x2039), "lsquo" => U(0x2018), "lt" => U(0x3C), "LT" => U(0x3C), "macr" => U(0xAF),
    "mdash" => U(0x2014), "micro" => U(0xB5), "middot" => U(0xB7), "minus" => U(0x2212), "mu" => U(0x03BC),
    "nabla" => U(0x2207), "nbsp" => U(0xA0), "ndash" => U(0x2013), "ne" => U(0x2260), "ni" => U(0x220B),
    "not" => U(0xAC), "notin" => U(0x2209), "nsub" => U(0x2284), "ntilde" => U(0xF1), "nu" => U(0x03BD),
    "oacute" => U(0xF3), "ocirc" => U(0xF4), "oelig" => U(0x0153), "ograve" => U(0xF2), "oline" => U(0x203E),
    "omega" => U(0x03C9), "omicron" => U(0x03BF), "oplus" => U(0x2295), "or" => U(0x2228), "ordf" => U(0xAA),
    "ordm" => U(0xBA), "oslash" => U(0xF8), "otilde" => U(0xF5), "otimes" => U(0x2297), "ouml" => U(0xF6),
    "para" => U(0xB6), "part" => U(0x2202), "permil" => U(0x2030), "perp" => U(0x22A5), "phi" => U(0x03C6),
    "pi" => U(0x03C0), "piv" => U(0x03D6), "plusmn" => U(0xB1), "pound" => U(0xA3), "prime" => U(0x2032),
    "prod" => U(0x220F), "prop" => U(0x221D), "psi" => U(0x03C8), "quot" => U(0x22), "QUOT" => U(0x22),
    "rArr" => U(0x21D2), "radic" => U(0x221A), "rang" => U(0x232A), "raquo" => U(0xBB), "rarr" => U(0x2192),
    "rceil" => U(0x2309), "rdquo" => U(0x201D), "real" => U(0x211C), "reg" => U(0xAE), "REG" => U(0xAE),
    "rfloor" => U(0x230B), "rho" => U(0x03C1), "rlm" => U(0x200F), "rsaquo" => U(0x203A), "rsquo" => U(0x2019),
    "sbquo" => U(0x201A), "scaron" => U(0x0161), "sdot" => U(0x22C5), "sect" => U(0xA7), "shy" => U(0xAD),
    "sigma" => U(0x03C3), "sigmaf" => U(0x03C2), "sim" => U(0x223C), "spades" => U(0x2660), "sub" => U(0x2282),
    "sube" => U(0x2286), "sum" => U(0x2211), "sup" => U(0x2283), "sup1" => U(0xB9), "sup2" => U(0xB2),
    "sup3" => U(0xB3), "supe" => U(0x2287), "szlig" => U(0xDF), "tau" => U(0x03C4), "there4" => U(0x2234),
    "theta" => U(0x03B8), "thetasym" => U(0x03D1), "thinsp" => U(0x2009), "thorn" => U(0xFE), "tilde" => U(0x02DC),
    "times" => U(0xD7), "trade" => U(0x2122), "uArr" => U(0x21D1), "uacute" => U(0xFA), "uarr" => U(0x2191),
    "ucirc" => U(0xFB), "ugrave" => U(0xF9), "uml" => U(0xA8), "upsih" => U(0x03D2), "upsilon" => U(0x03C5),
    "uuml" => U(0xFC), "weierp" => U(0x2118), "xi" => U(0x03BE), "yacute" => U(0xFD), "yen" => U(0xA5),
    "yuml" => U(0xFF), "zeta" => U(0x03B6), "zwj" => U(0x200D), "zwnj" => U(0x200C)
  }

  ENCODINGS = %w[
    ansi_x3.4-1968 iso-ir-6 ansi_x3.4-1986 iso_646.irv:1991 ascii iso646-us us-ascii us
    ibm367 cp367 csascii ks_c_5601-1987 korean iso-2022-kr csiso2022kr euc-kr
    iso-2022-jp csiso2022jp iso-2022-jp-2 iso-ir-58 chinese csiso58gb231280 iso_8859-1:1987 iso-ir-100
    iso_8859-1 iso-8859-1 latin1 l1 ibm819 cp819 csisolatin1 iso_8859-2:1987
    iso-ir-101 iso_8859-2 iso-8859-2 latin2 l2 csisolatin2 iso_8859-3:1988 iso-ir-109
    iso_8859-3 iso-8859-3 latin3 l3 csisolatin3 iso_8859-4:1988 iso-ir-110 iso_8859-4
    iso-8859-4 latin4 l4 csisolatin4 iso_8859-6:1987 iso-ir-127 iso_8859-6 iso-8859-6
    ecma-114 asmo-708 arabic csisolatinarabic iso_8859-7:1987 iso-ir-126 iso_8859-7 iso-8859-7
    elot_928 ecma-118 greek greek8 csisolatingreek iso_8859-8:1988 iso-ir-138 iso_8859-8
    iso-8859-8 hebrew csisolatinhebrew iso_8859-5:1988 iso-ir-144 iso_8859-5 iso-8859-5 cyrillic
    csisolatincyrillic iso_8859-9:1989 iso-ir-148 iso_8859-9 iso-8859-9 latin5 l5 csisolatin5
    iso-8859-10 iso-ir-157 l6 iso_8859-10:1992 csisolatin6 latin6 hp-roman8 roman8
    r8 ibm037 cp037 csibm037 ibm424 cp424 csibm424 ibm437
    cp437 437 cspc8codepage437 ibm500 cp500 csibm500 ibm775 cp775
    cspc775baltic ibm850 cp850 850 cspc850multilingual ibm852 cp852 852
    cspcp852 ibm855 cp855 855 csibm855 ibm857 cp857 857
    csibm857 ibm860 cp860 860 csibm860 ibm861 cp861 861
    cp-is csibm861 ibm862 cp862 862 cspc862latinhebrew ibm863 cp863
    863 csibm863 ibm864 cp864 csibm864 ibm865 cp865 865
    csibm865 ibm866 cp866 866 csibm866 ibm869 cp869 869
    cp-gr csibm869 ibm1026 cp1026 csibm1026 koi8-r cskoi8r koi8-u
    big5-hkscs ptcp154 csptcp154 pt154 cp154 utf-7 utf-16be utf-16le
    utf-16 utf-8 iso-8859-13 iso-8859-14 iso-ir-199 iso_8859-14:1998 iso_8859-14 latin8
    iso-celtic l8 iso-8859-15 iso_8859-15 iso-8859-16 iso-ir-226 iso_8859-16:2001 iso_8859-16
    latin10 l10 gbk cp936 ms936 gb18030 shift_jis ms_kanji
    csshiftjis euc-jp gb2312 big5 csbig5 windows-1250 windows-1251 windows-1252
    windows-1253 windows-1254 windows-1255 windows-1256 windows-1257 windows-1258 tis-620 hz-gb-2312
  ]

end
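
For a feel of how these tables are consumed elsewhere in the library, a couple of hedged lookups (the result comments are worked out from the table contents above, not captured output):

    HTML5lib::ENTITIES['amp']                    # => "&"  (built with the private U() helper)
    HTML5lib::ENTITIES['ldquo']                  # => the UTF-8 encoding of U+201C
    HTML5lib::ENTITIES_WINDOWS1252[0x93 - 0x80]  # => 8220, the codepoint a stray 0x93 byte is remapped to
    HTML5lib::HEX_DIGITS.include?('A')           # => true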

vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb (vendored, new normal file, 2020 additions)
File diff suppressed because it is too large.

vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb (vendored, new executable file, 549 additions)
@@ -0,0 +1,549 @@

require 'stringio'
require 'html5lib/constants'

module HTML5lib

  # Provides a unicode stream of characters to the HTMLTokenizer.
  # This class takes care of character encoding and removing or replacing
  # incorrect byte-sequences and also provides column and line tracking.

  class HTMLInputStream

    attr_accessor :queue, :charEncoding

    # Initialises the HTMLInputStream.
    #
    # HTMLInputStream(source, [encoding]) -> Normalized stream from source
    # for use by the HTML5Lib.
    #
    # source can be either a file-object, local filename or a string.
    #
    # The optional encoding parameter must be a string that indicates
    # the encoding. If specified, that encoding will be used,
    # regardless of any BOM or later declaration (such as in a meta
    # element)
    #
    # parseMeta - Look for a <meta> element containing encoding information

    def initialize(source, options = {})
      @encoding = nil
      @parseMeta = true
      @chardet = true

      options.each { |name, value| instance_variable_set("@#{name}", value) }

      # List of where new lines occur
      @newLines = []

      # Raw Stream
      @rawStream = openStream(source)

      # Encoding Information
      # Number of bytes to use when looking for a meta element with
      # encoding information
      @NUM_BYTES_META = 512
      # Encoding to use if no other information can be found
      @DEFAULT_ENCODING = 'windows-1252'

      # Detect encoding iff no explicit "transport level" encoding is supplied
      if @encoding.nil? or not HTML5lib.isValidEncoding(@encoding)
        @charEncoding = detectEncoding
      else
        @charEncoding = @encoding
      end

      # Read bytes from stream decoding them into Unicode
      uString = @rawStream.read
      unless @charEncoding == 'utf-8'
        begin
          require 'iconv'
          uString = Iconv.iconv('utf-8', @encoding, uString)[0]
        rescue
        end
      end

      # Normalize newlines and null characters
      uString.gsub!(/\r\n?/, "\n")
      uString.gsub!("\x00", [0xFFFD].pack('U'))

      # Convert the unicode string into a list to be used as the data stream
      @dataStream = uString

      @queue = []

      # Reset position in the list to read from
      reset
    end

    # Produces a file object from source.
    #
    # source can be either a file object, local filename or a string.
    def openStream(source)
      # Already an IO like object
      if source.respond_to?(:read)
        @stream = source
      else
        # Treat source as a string and wrap in StringIO
        @stream = StringIO.new(source)
      end
      return @stream
    end

    def detectEncoding
      # First look for a BOM
      # This will also read past the BOM if present
      encoding = detectBOM
      # If there is no BOM need to look for meta elements with encoding
      # information
      if encoding.nil? and @parseMeta
        encoding = detectEncodingMeta
      end
      # Guess with chardet, if avaliable
      if encoding.nil? and @chardet
        begin
          require 'rubygems'
          require 'UniversalDetector' # gem install chardet
          buffer = @rawStream.read
          encoding = UniversalDetector::chardet(buffer)['encoding']
          @rawStream = openStream(buffer)
        rescue LoadError
        end
      end
      # If all else fails use the default encoding
      if encoding.nil?
        encoding = @DEFAULT_ENCODING
      end

      # Substitute for equivalent encodings:
      encodingSub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}

      if encodingSub.has_key?(encoding.downcase)
        encoding = encodingSub[encoding.downcase]
      end

      return encoding
    end

    # Attempts to detect at BOM at the start of the stream. If
    # an encoding can be determined from the BOM return the name of the
    # encoding otherwise return nil
    def detectBOM
      bomDict = {
        "\xef\xbb\xbf" => 'utf-8',
        "\xff\xfe" => 'utf-16-le',
        "\xfe\xff" => 'utf-16-be',
        "\xff\xfe\x00\x00" => 'utf-32-le',
        "\x00\x00\xfe\xff" => 'utf-32-be'
      }

      # Go to beginning of file and read in 4 bytes
      @rawStream.seek(0)
      string = @rawStream.read(4)
      return nil unless string

      # Try detecting the BOM using bytes from the string
      encoding = bomDict[string[0...3]] # UTF-8
      seek = 3
      unless encoding
        # Need to detect UTF-32 before UTF-16
        encoding = bomDict[string] # UTF-32
        seek = 4
        unless encoding
          encoding = bomDict[string[0...2]] # UTF-16
          seek = 2
        end
      end

      # AT - move this to the caller?
      # Set the read position past the BOM if one was found, otherwise
      # set it to the start of the stream
      @rawStream.seek(encoding ? seek : 0)

      return encoding
    end

    # Report the encoding declared by the meta element
    def detectEncodingMeta
      parser = EncodingParser.new(@rawStream.read(@NUM_BYTES_META))
      @rawStream.seek(0)
      return parser.getEncoding
    end

    def determineNewLines
      # Looks through the stream to find where new lines occur so
      # the position method can tell where it is.
      @newLines.push(0)
      (0...@dataStream.length).each { |i| @newLines.push(i) if @dataStream[i] == ?\n }
    end

    # Returns (line, col) of the current position in the stream.
    def position
      # Generate list of new lines first time around
      determineNewLines if @newLines.empty?
      line = 0
      tell = @tell
      @newLines.each do |pos|
        break unless pos < tell
        line += 1
      end
      col = tell - @newLines[line-1] - 1
      return [line, col]
    end

    # Resets the position in the stream back to the start.
    def reset
      @tell = 0
    end

    # Read one character from the stream or queue if available. Return
    # EOF when EOF is reached.
    def char
      unless @queue.empty?
        return @queue.shift
      else
        begin
          @tell += 1
          return @dataStream[@tell - 1].chr
        rescue
          return :EOF
        end
      end
    end

    # Returns a string of characters from the stream up to but not
    # including any character in characters or EOF. characters can be
    # any container that supports the in method being called on it.
    def charsUntil(characters, opposite = false)
      charStack = [char]

      unless charStack[0] == :EOF
        while (characters.include? charStack[-1]) == opposite
          unless @queue.empty?
            # First from the queue
            charStack.push(@queue.shift)
            break if charStack[-1] == :EOF
          else
            # Then the rest
            begin
              charStack.push(@dataStream[@tell].chr)
              @tell += 1
            rescue
              charStack.push(:EOF)
              break
            end
          end
        end
      end

      # Put the character stopped on back to the front of the queue
      # from where it came.
      @queue.insert(0, charStack.pop)
      return charStack.join('')
    end
  end

  # String-like object with an assosiated position and various extra methods
  # If the position is ever greater than the string length then an exception is raised
  class EncodingBytes < String

    attr_accessor :position

    def initialize(value)
      super(value)
      @position = -1
    end

    def each
      while @position < length
        @position += 1
        yield self[@position]
      end
    rescue EOF
    end

    def currentByte
      raise EOF if @position >= length
      return self[@position].chr
    end

    # Skip past a list of characters
    def skip(chars = SPACE_CHARACTERS)
      while chars.include?(currentByte)
        @position += 1
      end
    end

    # Look for a sequence of bytes at the start of a string. If the bytes
    # are found return true and advance the position to the byte after the
    # match. Otherwise return false and leave the position alone
    def matchBytes(bytes, lower = false)
      data = self[position ... position+bytes.length]
      data.downcase! if lower
      rv = (data == bytes)
      @position += bytes.length if rv == true
      return rv
    end

    # Look for the next sequence of bytes matching a given sequence. If
    # a match is found advance the position to the last byte of the match
    def jumpTo(bytes)
      newPosition = self[position .. -1].index(bytes)
      if newPosition
        @position += (newPosition + bytes.length-1)
        return true
      else
        raise EOF
      end
    end

    # Move the pointer so it points to the next byte in a set of possible
    # bytes
    def findNext(byteList)
      until byteList.include?(currentByte)
        @position += 1
      end
    end
  end

  # Mini parser for detecting character encoding from meta elements
  class EncodingParser

    # string - the data to work on for encoding detection
    def initialize(data)
      @data = EncodingBytes.new(data.to_s)
      @encoding = nil
    end

    @@method_dispatch = [
      ['<!--', :handleComment],
      ['<meta', :handleMeta],
      ['</', :handlePossibleEndTag],
      ['<!', :handleOther],
      ['<?', :handleOther],
      ['<', :handlePossibleStartTag]
    ]

    def getEncoding
      @data.each do |byte|
        keepParsing = true
        @@method_dispatch.each do |(key, method)|
          if @data.matchBytes(key, lower = true)
            keepParsing = send(method)
            break
          end
        end
        break unless keepParsing
      end
      @encoding = @encoding.strip unless @encoding.nil?
      return @encoding
    end

    # Skip over comments
    def handleComment
      return @data.jumpTo('-->')
    end

    def handleMeta
      # if we have <meta not followed by a space so just keep going
      return true unless SPACE_CHARACTERS.include?(@data.currentByte)

      # We have a valid meta element we want to search for attributes
      while true
        # Try to find the next attribute after the current position
        attr = getAttribute

        return true if attr.nil?

        if attr[0] == 'charset'
          tentativeEncoding = attr[1]
          if HTML5lib.isValidEncoding(tentativeEncoding)
            @encoding = tentativeEncoding
            return false
          end
        elsif attr[0] == 'content'
          contentParser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
          tentativeEncoding = contentParser.parse
          if HTML5lib.isValidEncoding(tentativeEncoding)
            @encoding = tentativeEncoding
            return false
          end
        end
      end
    end

    def handlePossibleStartTag
      return handlePossibleTag(false)
    end

    def handlePossibleEndTag
      @data.position += 1
      return handlePossibleTag(true)
    end

    def handlePossibleTag(endTag)
      unless ASCII_LETTERS.include?(@data.currentByte)
        # If the next byte is not an ascii letter either ignore this
        # fragment (possible start tag case) or treat it according to
        # handleOther
        if endTag
          @data.position -= 1
          handleOther
        end
        return true
      end

      @data.findNext(SPACE_CHARACTERS + ['<', '>'])

      if @data.currentByte == '<'
        # return to the first step in the overall "two step" algorithm
        # reprocessing the < byte
        @data.position -= 1
      else
        # Read all attributes
        {} until getAttribute.nil?
      end
      return true
    end

    def handleOther
      return @data.jumpTo('>')
    end

    # Return a name,value pair for the next attribute in the stream,
    # if one is found, or nil
    def getAttribute
      @data.skip(SPACE_CHARACTERS + ['/'])

      if @data.currentByte == '<'
        @data.position -= 1
        return nil
      elsif @data.currentByte == '>'
        return nil
      end

      attrName = []
      attrValue = []
      spaceFound = false
      # Step 5 attribute name
      while true
        if @data.currentByte == '=' and attrName:
          break
        elsif SPACE_CHARACTERS.include?(@data.currentByte)
          spaceFound = true
          break
        elsif ['/', '<', '>'].include?(@data.currentByte)
          return [attrName.join(''), '']
        elsif ASCII_UPPERCASE.include?(@data.currentByte)
          attrName.push(@data.currentByte.downcase)
        else
          attrName.push(@data.currentByte)
        end
        # Step 6
        @data.position += 1
      end
      # Step 7
      if spaceFound
        @data.skip
        # Step 8
        unless @data.currentByte == '='
          @data.position -= 1
          return [attrName.join(''), '']
        end
      end
      # XXX need to advance position in both spaces and value case
      # Step 9
      @data.position += 1
      # Step 10
      @data.skip
      # Step 11
      if ["'", '"'].include?(@data.currentByte)
        # 11.1
        quoteChar = @data.currentByte
        while true
          @data.position += 1
          # 11.3
          if @data.currentByte == quoteChar
            @data.position += 1
            return [attrName.join(''), attrValue.join('')]
          # 11.4
          elsif ASCII_UPPERCASE.include?(@data.currentByte)
            attrValue.push(@data.currentByte.downcase)
          # 11.5
          else
            attrValue.push(@data.currentByte)
          end
        end
      elsif ['>', '<'].include?(@data.currentByte)
        return [attrName.join(''), '']
      elsif ASCII_UPPERCASE.include?(@data.currentByte)
        attrValue.push(@data.currentByte.downcase)
      else
        attrValue.push(@data.currentByte)
      end
      while true
        @data.position += 1
        if (SPACE_CHARACTERS + ['>', '<']).include?(@data.currentByte)
          return [attrName.join(''), attrValue.join('')]
        elsif ASCII_UPPERCASE.include?(@data.currentByte)
          attrValue.push(@data.currentByte.downcase)
        else
          attrValue.push(@data.currentByte)
        end
      end
    end
  end

  class ContentAttrParser
    def initialize(data)
      @data = data
    end

    def parse
      begin
        # Skip to the first ";"
        @data.position = 0
        @data.jumpTo(';')
        @data.position += 1
        @data.skip
        # Check if the attr name is charset
        # otherwise return
        @data.jumpTo('charset')
        @data.position += 1
        @data.skip
        unless @data.currentByte == '='
          # If there is no = sign keep looking for attrs
          return nil
        end
        @data.position += 1
        @data.skip
        # Look for an encoding between matching quote marks
        if ['"', "'"].include?(@data.currentByte)
          quoteMark = @data.currentByte
          @data.position += 1
          oldPosition = @data.position
          @data.jumpTo(quoteMark)
          return @data[oldPosition ... @data.position]
        else
          # Unquoted value
          oldPosition = @data.position
          begin
            @data.findNext(SPACE_CHARACTERS)
            return @data[oldPosition ... @data.position]
          rescue EOF
            # Return the whole remaining value
            return @data[oldPosition .. -1]
          end
        end
      rescue EOF
        return nil
      end
    end
  end

  # Determine if a string is a supported encoding
  def self.isValidEncoding(encoding)
    (not encoding.nil? and encoding.kind_of?(String) and ENCODINGS.include?(encoding.downcase.strip))
  end

end
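
A short sketch of exercising the stream on its own, assuming the Ruby 1.8-era string semantics this code targets and no chardet gem installed (inputs and result comments are inferred from the code above, not captured test output):

    require 'html5lib/inputstream'

    # A UTF-8 byte-order mark is detected first and read past.
    HTML5lib::HTMLInputStream.new("\xef\xbb\xbf<p>hi</p>").charEncoding         # => 'utf-8'

    # Failing that, a <meta charset> within the first 512 bytes is honoured
    # (attribute values are downcased as they are read).
    HTML5lib::HTMLInputStream.new('<meta charset="ISO-8859-2">').charEncoding   # => 'iso-8859-2'

    # With no other information, the documented fallback applies.
    HTML5lib::HTMLInputStream.new('<p>hi</p>').charEncoding                     # => 'windows-1252'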

vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb (vendored, new executable file, 141 additions)
@@ -0,0 +1,141 @@

# Warning: this module is experimental and subject to change and even removal
# at any time.
#
# For background/rationale, see:
#  * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
#  * http://tinyurl.com/ylfj8k (and follow-ups)
#
# References:
#  * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
#  * http://wiki.whatwg.org/wiki/HtmlVsXhtml
#
# @@TODO:
#  * Selectively lowercase only XHTML, but not foreign markup
require 'html5lib/html5parser'
require 'html5lib/constants'

module HTML5lib

  # liberal XML parser
  class XMLParser < HTMLParser

    def initialize(options={})
      super options
      @phases[:initial] = XmlRootPhase.new(self, @tree)
    end

    def normalizeToken(token)
      if token[:type] == :StartTag or token[:type] == :EmptyTag
        # We need to remove the duplicate attributes and convert attributes
        # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}

        token[:data] = Hash[*token[:data].reverse.flatten]

        # For EmptyTags, process both a Start and an End tag
        if token[:type] == :EmptyTag
          @phase.processStartTag(token[:name], token[:data])
          token[:data] = {}
          token[:type] = :EndTag
        end

      elsif token[:type] == :EndTag
        if token[:data]
          parseError(_("End tag contains unexpected attributes."))
        end

      elsif token[:type] == :Comment
        # Rescue CDATA from the comments
        if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
          token[:type] = :Characters
          token[:data] = token[:data][7 ... -2]
        end
      end

      return token
    end
  end

  # liberal XHTML parser
  class XHTMLParser < XMLParser

    def initialize(options={})
      super options
      @phases[:initial] = InitialPhase.new(self, @tree)
      @phases[:rootElement] = XhmlRootPhase.new(self, @tree)
    end

    def normalizeToken(token)
      super(token)

      # ensure that non-void XHTML elements have content so that separate
      # open and close tags are emitted
      if token[:type] == :EndTag and \
        not VOID_ELEMENTS.include? token[:name] and \
        token[:name] == @tree.openElements[-1].name and \
        not @tree.openElements[-1].hasContent
        @tree.insertText('') unless
          @tree.openElements.any? {|e|
            e.attributes.keys.include? 'xmlns' and
            e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
          }
      end

      return token
    end
  end

  class XhmlRootPhase < RootElementPhase
    def insertHtmlElement
      element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
      @tree.openElements.push(element)
      @tree.document.appendChild(element)
      @parser.phase = @parser.phases[:beforeHead]
    end
  end

  class XmlRootPhase < Phase
    # Prime the Xml parser
    @start_tag_handlers = Hash.new(:startTagOther)
    @end_tag_handlers = Hash.new(:endTagOther)

    def startTagOther(name, attributes)
      @tree.openElements.push(@tree.document)
      element = @tree.createElement(name, attributes)
      @tree.openElements[-1].appendChild(element)
      @tree.openElements.push(element)
      @parser.phase = XmlElementPhase.new(@parser, @tree)
    end

    def endTagOther(name)
      super
      @tree.openElements.pop
    end
  end

  class XmlElementPhase < Phase
    # Generic handling for all XML elements

    @start_tag_handlers = Hash.new(:startTagOther)
    @end_tag_handlers = Hash.new(:endTagOther)

    def startTagOther(name, attributes)
      element = @tree.createElement(name, attributes)
      @tree.openElements[-1].appendChild(element)
      @tree.openElements.push(element)
    end

    def endTagOther(name)
      for node in @tree.openElements.reverse
        if node.name == name
          {} while @tree.openElements.pop != node
          break
        else
          @parser.parseError
        end
      end
    end

    def processCharacters(data)
      @tree.insertText(data)
    end
  end

end
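
To make the CDATA-rescue branch of normalizeToken concrete, here is how a comment token would be rewritten (the token hash is illustrative, not taken from a test in this commit):

    token = {:type => :Comment, :data => '[CDATA[x < y]]'}
    # After XMLParser#normalizeToken(token):
    #   token[:type]  # => :Characters
    #   token[:data]  # => 'x < y'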

vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb (vendored, new normal file, 178 additions)
@@ -0,0 +1,178 @@

require 'html5lib/tokenizer'
require 'cgi'

module HTML5lib

  # This module provides sanitization of XHTML+MathML+SVG
  # and of inline style attributes.

  class HTMLSanitizer < HTMLTokenizer

    ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
      button caption center cite code col colgroup dd del dfn dir div dl dt
      em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
      legend li map menu ol optgroup option p pre q s samp select small span
      strike strong sub sup table tbody td textarea tfoot th thead tr tt u
      ul var]

    MATHML_ELEMENTS = %w[maction math merror mfrac mi mmultiscripts mn mo
      mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub
      msubsup msup mtable mtd mtext mtr munder munderover none]

    SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform
      circle defs desc ellipse font-face font-face-name font-face-src g
      glyph hkern image linearGradient line marker metadata missing-glyph
      mpath path polygon polyline radialGradient rect set stop svg switch
      text title tspan use]

    ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action
      align alt axis border cellpadding cellspacing char charoff charset
      checked cite class clear cols colspan color compact coords datetime
      dir disabled enctype for frame headers height href hreflang hspace id
      ismap label lang longdesc maxlength media method multiple name nohref
      noshade nowrap prompt readonly rel rev rows rowspan rules scope
      selected shape size span src start style summary tabindex target title
      type usemap valign value vspace width xml:lang]

    MATHML_ATTRIBUTES = %w[actiontype align columnalign columnalign
      columnalign columnlines columnspacing columnspan depth display
      displaystyle equalcolumns equalrows fence fontstyle fontweight frame
      height linethickness lspace mathbackground mathcolor mathvariant
      mathvariant maxsize minsize other rowalign rowalign rowalign rowlines
      rowspacing rowspan rspace scriptlevel selection separator stretchy
      width width xlink:href xlink:show xlink:type xmlns xmlns:xlink]

    SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic
      arabic-form ascent attributeName attributeType baseProfile bbox begin
      by calcMode cap-height class color color-rendering content cx cy d dx
      dy descent display dur end fill fill-rule font-family font-size
      font-stretch font-style font-variant font-weight from fx fy g1 g2
      glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
      ideographic k keyPoints keySplines keyTimes lang marker-end
      marker-mid marker-start markerHeight markerUnits markerWidth
      mathematical max min name offset opacity orient origin
      overline-position overline-thickness panose-1 path pathLength points
      preserveAspectRatio r refX refY repeatCount repeatDur
      requiredExtensions requiredFeatures restart rotate rx ry slope stemh
      stemv stop-color stop-opacity strikethrough-position
      strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
      stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
      stroke-width systemLanguage target text-anchor to transform type u1
      u2 underline-position underline-thickness unicode unicode-range
      units-per-em values version viewBox visibility width widths x
      x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
      xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
      xmlns:xlink y y1 y2 zoomAndPan]

    ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href]

    ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color
      border-bottom-color border-collapse border-color border-left-color
      border-right-color border-top-color clear color cursor direction
      display elevation float font font-family font-size font-style
      font-variant font-weight height letter-spacing line-height overflow
      pause pause-after pause-before pitch pitch-range richness speak
      speak-header speak-numeral speak-punctuation speech-rate stress
      text-align text-decoration text-indent unicode-bidi vertical-align
      voice-family volume white-space width]

    ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom
      brown center collapse dashed dotted fuchsia gray green !important
      italic left lime maroon medium none navy normal nowrap olive pointer
      purple red right solid silver teal top transparent underline white
      yellow]

    ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke
      stroke-width stroke-linecap stroke-linejoin stroke-opacity]

    ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp
      telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]

    # subclasses may define their own versions of these constants
    ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
    ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
    ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
    ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
    ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
    ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style
    # attributes are parsed, and a restricted set, specified by
    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
    # Attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
    # in ALLOWED_PROTOCOLS are allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => &lt;script> do_nasty_stuff() &lt;/script>
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def each
      super do |token|
        case token[:type]
        when :StartTag, :EndTag, :EmptyTag
          if ALLOWED_ELEMENTS.include?(token[:name])
            if token.has_key? :data
              attrs = Hash[*token[:data].flatten]
              attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
              ATTR_VAL_IS_URI.each do |attr|
                val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
                if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
                  attrs.delete attr
                end
              end
              if attrs['style']
                attrs['style'] = sanitize_css(attrs['style'])
              end
              token[:data] = attrs.map {|k,v| [k,v]}
            end
            yield token
          else
            if token[:type] == :EndTag
              token[:data] = "</#{token[:name]}>"
            elsif token[:data]
              attrs = token[:data].map {|k,v| " #{k}=\"#{CGI.escapeHTML(v)}\""}.join('')
              token[:data] = "<#{token[:name]}#{attrs}>"
            else
              token[:data] = "<#{token[:name]}>"
            end
            token[:data].insert(-2,'/') if token[:type] == :EmptyTag
            token[:type] = :Characters
            token.delete(:name)
            yield token
          end
        else
          yield token
        end

      end
    end

    def sanitize_css(style)
      # disallow urls
      style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

      # gauntlet
      return '' unless style =~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
      return '' unless style =~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/

      clean = []
      style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
        next if val.empty?
        prop.downcase!
        if ALLOWED_CSS_PROPERTIES.include?(prop)
          clean << "#{prop}: #{val};"
        elsif %w[background border margin padding].include?(prop.split('-')[0])
          clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
            !ALLOWED_CSS_KEYWORDS.include?(keyword) and
            keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
          end
        elsif ALLOWED_SVG_PROPERTIES.include?(prop)
          clean << "#{prop}: #{val};"
        end
      end

      style = clean.join(' ')
    end
  end
end
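
Because HTMLSanitizer is simply a tokenizer whose each filters what it yields, a minimal way to exercise it is to walk the token stream (re-serialising tokens back into markup happens outside this file, so only the tokens are shown here):

    require 'html5lib/sanitizer'

    html = '<a href="javascript:alert(1)" title="hi">x</a><script>bad()</script>'
    HTML5lib::HTMLSanitizer.new(html).each do |token|
      # The <a> start tag survives with its javascript: href dropped and title kept;
      # the <script> tags come through as escaped :Characters tokens.
      p token
    end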
854
vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb
vendored
Normal file
854
vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb
vendored
Normal file
|
@ -0,0 +1,854 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5lib/inputstream'
|
||||
|
||||
module HTML5lib
|
||||
|
||||
# This class takes care of tokenizing HTML.
|
||||
#
|
||||
# * @currentToken
|
||||
# Holds the token that is currently being processed.
|
||||
#
|
||||
# * @state
|
||||
# Holds a reference to the method to be invoked... XXX
|
||||
#
|
||||
# * @states
|
||||
# Holds a mapping between states and methods that implement the state.
|
||||
#
|
||||
# * @stream
|
||||
# Points to HTMLInputStream object.
|
||||
|
||||
class HTMLTokenizer
|
||||
attr_accessor :contentModelFlag, :currentToken
|
||||
attr_reader :stream
|
||||
|
||||
# XXX need to fix documentation
|
||||
|
||||
def initialize(stream, options={})
|
||||
@stream = HTMLInputStream.new(stream, options)
|
||||
|
||||
@states = {
|
||||
:data => :dataState,
|
||||
:entityData => :entityDataState,
|
||||
:tagOpen => :tagOpenState,
|
||||
:closeTagOpen => :closeTagOpenState,
|
||||
:tagName => :tagNameState,
|
||||
:beforeAttributeName => :beforeAttributeNameState,
|
||||
:attributeName => :attributeNameState,
|
||||
:afterAttributeName => :afterAttributeNameState,
|
||||
:beforeAttributeValue => :beforeAttributeValueState,
|
||||
:attributeValueDoubleQuoted => :attributeValueDoubleQuotedState,
|
||||
:attributeValueSingleQuoted => :attributeValueSingleQuotedState,
|
||||
:attributeValueUnQuoted => :attributeValueUnQuotedState,
|
||||
:bogusComment => :bogusCommentState,
|
||||
:markupDeclarationOpen => :markupDeclarationOpenState,
|
||||
:comment => :commentState,
|
||||
:commentDash => :commentDashState,
|
||||
:commentEnd => :commentEndState,
|
||||
:doctype => :doctypeState,
|
||||
:beforeDoctypeName => :beforeDoctypeNameState,
|
||||
:doctypeName => :doctypeNameState,
|
||||
:afterDoctypeName => :afterDoctypeNameState,
|
||||
:bogusDoctype => :bogusDoctypeState
|
||||
}
|
||||
|
||||
# Setup the initial tokenizer state
|
||||
@contentModelFlag = :PCDATA
|
||||
@state = @states[:data]
|
||||
|
||||
# The current token being created
|
||||
@currentToken = nil
|
||||
|
||||
# Tokens to be processed.
|
||||
@tokenQueue = []
|
||||
end
|
||||
|
||||
# This is where the magic happens.
|
||||
#
|
||||
# We do our usually processing through the states and when we have a token
|
||||
# to return we yield the token which pauses processing until the next token
|
||||
# is requested.
|
||||
def each
|
||||
@stream.reset
|
||||
@tokenQueue = []
|
||||
# Start processing. When EOF is reached @state will return false
|
||||
# instead of true and the loop will terminate.
|
||||
while send @state
|
||||
while not @tokenQueue.empty?
|
||||
yield @tokenQueue.shift
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Below are various helper functions the tokenizer states use worked out.
|
||||
|
||||
# If the next character is a '>', convert the currentToken into
|
||||
# an EmptyTag
|
||||
|
||||
def processSolidusInTag
|
||||
|
||||
# We need to consume another character to make sure it's a ">"
|
||||
data = @stream.char
|
||||
|
||||
if @currentToken[:type] == :StartTag and data == ">"
|
||||
@currentToken[:type] = :EmptyTag
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Solidus (/) incorrectly placed in tag.")})
|
||||
end
|
||||
|
||||
# The character we just consumed need to be put back on the stack so it
|
||||
# doesn't get lost...
|
||||
@stream.queue.push(data)
|
||||
end
|
||||
|
||||
# This function returns either U+FFFD or the character based on the
|
||||
# decimal or hexadecimal representation. It also discards ";" if present.
|
||||
# If not present @tokenQueue.push({:type => :ParseError}") is invoked.
|
||||
|
||||
def consumeNumberEntity(isHex)
|
||||
|
||||
# XXX More need to be done here. For instance, #13 should prolly be
|
||||
# converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
|
||||
# such. Thoughts on this appreciated.
|
||||
allowed = DIGITS
|
||||
radix = 10
|
||||
if isHex
|
||||
allowed = HEX_DIGITS
|
||||
radix = 16
|
||||
end
|
||||
|
||||
char = [0xFFFD].pack('U')
|
||||
charStack = []
|
||||
|
||||
# Consume all the characters that are in range while making sure we
|
||||
# don't hit an EOF.
|
||||
c = @stream.char
|
||||
while allowed.include?(c) and c != :EOF
|
||||
charStack.push(c)
|
||||
c = @stream.char
|
||||
end
|
||||
|
||||
# Convert the set of characters consumed to an int.
|
||||
charAsInt = charStack.join('').to_i(radix)
|
||||
|
||||
# If the integer is between 127 and 160 (so 128 and bigger and 159 and
|
||||
# smaller) we need to do the "windows trick".
|
||||
if (127...160).include? charAsInt
|
||||
#XXX - removed parse error from windows 1252 entity for now
|
||||
#we may want to reenable this later
|
||||
#@tokenQueue.push({:type => :ParseError, :data =>
|
||||
# _("Entity used with illegal number (windows-1252 reference).")})
|
||||
|
||||
charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
|
||||
end
|
||||
|
||||
# 0 is not a good number.
|
||||
if charAsInt == 0
|
||||
charAsInt = 65533
|
||||
end
|
||||
|
||||
if charAsInt <= 0x10FFF
|
||||
char = [charAsInt].pack('U')
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity couldn't be converted to character.")})
|
||||
end
|
||||
|
||||
# Discard the ; if present. Otherwise, put it back on the queue and
|
||||
# invoke parseError on parser.
|
||||
if c != ";"
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity didn't end with ';'.")})
|
||||
@stream.queue.push(c)
|
||||
end
|
||||
|
||||
return char
|
||||
end
|
||||
|
||||
def consumeEntity
|
||||
char = nil
|
||||
charStack = [@stream.char]
|
||||
if charStack[0] == "#"
|
||||
# We might have a number entity here.
|
||||
charStack += [@stream.char, @stream.char]
|
||||
if charStack.include? :EOF
|
||||
# If we reach the end of the file put everything up to :EOF
|
||||
# back in the queue
|
||||
charStack = charStack[0...charStack.index(:EOF)]
|
||||
@stream.queue+= charStack
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity expected. Got end of file instead.")})
|
||||
else
|
||||
if charStack[1].downcase == "x" \
|
||||
and HEX_DIGITS.include? charStack[2]
|
||||
# Hexadecimal entity detected.
|
||||
@stream.queue.push(charStack[2])
|
||||
char = consumeNumberEntity(true)
|
||||
elsif DIGITS.include? charStack[1]
|
||||
# Decimal entity detected.
|
||||
@stream.queue += charStack[1..-1]
|
||||
char = consumeNumberEntity(false)
|
||||
else
|
||||
# No number entity detected.
|
||||
@stream.queue += charStack
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity expected but none found.")})
|
||||
end
|
||||
end
|
||||
# Break out if we reach the end of the file
|
||||
elsif charStack[0] == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Entity expected. Got end of file instead.")})
|
||||
else
|
||||
# At this point in the process might have named entity. Entities
|
||||
# are stored in the global variable "entities".
|
||||
#
|
||||
# Consume characters and compare to these to a substring of the
|
||||
# entity names in the list until the substring no longer matches.
|
||||
filteredEntityList = ENTITIES.keys
|
||||
filteredEntityList.reject! {|e| e[0].chr != charStack[0]}
|
||||
entityName = nil
|
||||
|
||||
while charStack[-1] != :EOF
|
||||
name = charStack.join('')
|
||||
if filteredEntityList.any? {|e| e[0...name.length] == name}
|
||||
filteredEntityList.reject! {|e| e[0...name.length] != name}
|
||||
charStack.push(@stream.char)
|
||||
else
|
||||
break
|
||||
end
|
||||
|
||||
if ENTITIES.include? name
|
||||
entityName = name
|
||||
end
|
||||
end
|
||||
|
||||
if entityName != nil
|
||||
char = ENTITIES[entityName]
|
||||
|
||||
# Check whether or not the last character returned can be
|
||||
# discarded or needs to be put back.
|
||||
if not charStack[-1] == ";"
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Named entity didn't end with ';'.")})
|
||||
@stream.queue += charStack[entityName.length..-1]
|
||||
end
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Named entity expected. Got none.")})
|
||||
@stream.queue += charStack
|
||||
end
|
||||
end
|
||||
return char
|
||||
end
|
||||
|
||||
# This method replaces the need for "entityInAttributeValueState".
|
||||
def processEntityInAttribute
|
||||
entity = consumeEntity
|
||||
if entity
|
||||
@currentToken[:data][-1][1] += entity
|
||||
else
|
||||
@currentToken[:data][-1][1] += "&"
|
||||
end
|
||||
end
|
||||
|
||||
# This method is a generic handler for emitting the tags. It also sets
|
||||
# the state to "data" because that's what's needed after a token has been
|
||||
# emitted.
|
||||
def emitCurrentToken
|
||||
# Add token to the queue to be yielded
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
end
|
||||
|
||||
|
||||
# Below are the various tokenizer states worked out.
|
||||
|
||||
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
|
||||
# documents to figure out what the order of the various if and elsif
|
||||
# statements should be.
|
||||
|
||||
def dataState
|
||||
data = @stream.char
|
||||
if data == "&" and (@contentModelFlag == :PCDATA or
|
||||
@contentModelFlag == :RCDATA)
|
||||
@state = @states[:entityData]
|
||||
elsif data == "<" and @contentModelFlag != :PLAINTEXT
|
||||
@state = @states[:tagOpen]
|
||||
elsif data == :EOF
|
||||
# Tokenization ends.
|
||||
return false
|
||||
elsif SPACE_CHARACTERS.include? data
|
||||
# Directly after emitting a token you switch back to the "data
|
||||
# state". At that point SPACE_CHARACTERS are important so they are
|
||||
# emitted separately.
|
||||
# XXX need to check if we don't need a special "spaces" flag on
|
||||
# characters.
|
||||
@tokenQueue.push({:type => :SpaceCharacters, :data =>
|
||||
data + @stream.charsUntil(SPACE_CHARACTERS, true)})
|
||||
else
|
||||
@tokenQueue.push({:type => :Characters, :data =>
|
||||
data + @stream.charsUntil(["&", "<"])})
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def entityDataState
|
||||
entity = consumeEntity
|
||||
if entity
|
||||
@tokenQueue.push({:type => :Characters, :data => entity})
|
||||
else
|
||||
@tokenQueue.push({:type => :Characters, :data => "&"})
|
||||
end
|
||||
@state = @states[:data]
|
||||
return true
|
||||
end
|
||||
|
||||
def tagOpenState
|
||||
data = @stream.char
|
||||
if @contentModelFlag == :PCDATA
|
||||
if data == "!"
|
||||
@state = @states[:markupDeclarationOpen]
|
||||
elsif data == "/"
|
||||
@state = @states[:closeTagOpen]
|
||||
elsif data != :EOF and ASCII_LETTERS.include? data
|
||||
@currentToken =\
|
||||
{:type => :StartTag, :name => data, :data => []}
|
||||
@state = @states[:tagName]
|
||||
elsif data == ">"
|
||||
# XXX In theory it could be something besides a tag name. But
|
||||
# do we really care?
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected tag name. Got '>' instead.")})
|
||||
@tokenQueue.push({:type => :Characters, :data => "<>"})
|
||||
@state = @states[:data]
|
||||
elsif data == "?"
|
||||
# XXX In theory it could be something besides a tag name. But
|
||||
# do we really care?
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected tag name. Got '?' instead (HTML doesn't " +
|
||||
"support processing instructions).")})
|
||||
@stream.queue.push(data)
|
||||
@state = @states[:bogusComment]
|
||||
else
|
||||
# XXX
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected tag name. Got something else instead")})
|
||||
@tokenQueue.push({:type => :Characters, :data => "<"})
|
||||
@stream.queue.push(data)
|
||||
@state = @states[:data]
|
||||
end
|
||||
else
|
||||
# We know the content model flag is set to either RCDATA or CDATA
|
||||
# now because this state can never be entered with the PLAINTEXT
|
||||
# flag.
|
||||
if data == "/"
|
||||
@state = @states[:closeTagOpen]
|
||||
else
|
||||
@tokenQueue.push({:type => :Characters, :data => "<"})
|
||||
@stream.queue.insert(0, data)
|
||||
@state = @states[:data]
|
||||
end
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def closeTagOpenState
|
||||
if (@contentModelFlag == :RCDATA or @contentModelFlag == :CDATA)
|
||||
if @currentToken
|
||||
charStack = []
|
||||
|
||||
# So far we know that "</" has been consumed. We now need to know
|
||||
# whether the next few characters match the name of last emitted
|
||||
# start tag which also happens to be the currentToken. We also need
|
||||
# to have the character directly after the characters that could
|
||||
# match the start tag name.
|
||||
(@currentToken[:name].length + 1).times do
|
||||
charStack.push(@stream.char)
|
||||
# Make sure we don't get hit by :EOF
|
||||
break if charStack[-1] == :EOF
|
||||
end
|
||||
|
||||
# Since this is just for checking. We put the characters back on
|
||||
# the stack.
|
||||
@stream.queue += charStack
|
||||
end
|
||||
|
||||
if @currentToken and
|
||||
@currentToken[:name].downcase ==
|
||||
charStack[0...-1].join('').downcase and
|
||||
(SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? charStack[-1]
|
||||
# Because the characters are correct we can safely switch to
|
||||
# PCDATA mode now. This also means we don't have to do it when
|
||||
# emitting the end tag token.
|
||||
@contentModelFlag = :PCDATA
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected closing tag after seeing '</'. None found.")})
|
||||
@tokenQueue.push({:type => :Characters, :data => "</"})
|
||||
@state = @states[:data]
|
||||
|
||||
# Need to return here since we don't want the rest of the
|
||||
# method to be walked through.
|
||||
return true
|
||||
end
|
||||
end
|
||||
|
||||
if @contentModelFlag == :PCDATA
|
||||
data = @stream.char
|
||||
if data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected closing tag. Unexpected end of file.")})
|
||||
@tokenQueue.push({:type => :Characters, :data => "</"})
|
||||
@state = @states[:data]
|
||||
elsif ASCII_LETTERS.include? data
|
||||
@currentToken =\
|
||||
{:type => :EndTag, :name => data, :data => []}
|
||||
@state = @states[:tagName]
|
||||
elsif data == ">"
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
|
||||
@state = @states[:data]
|
||||
else
|
||||
# XXX data can be _'_...
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected closing tag. Unexpected character '" + data + "' found.")})
|
||||
@stream.queue.push(data)
|
||||
@state = @states[:bogusComment]
|
||||
end
|
||||
end
|
||||
return true
|
||||
end
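# A hedged illustration (editorial) of the RCDATA/CDATA check above: while the
# content model flag is :RCDATA (e.g. inside <textarea>), "</div>" does not
# match the last emitted start tag name, so it is reported as a parse error
# and re-emitted as character data; "</textarea>" does match, the flag
# switches back to :PCDATA, and a real EndTag token is produced for it.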
|
||||
|
||||
def tagNameState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
@state = @states[:beforeAttributeName]
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in the tag name.")})
|
||||
emitCurrentToken
|
||||
elsif ASCII_LETTERS.include? data
|
||||
@currentToken[:name] += data +\
|
||||
@stream.charsUntil(ASCII_LETTERS, true)
|
||||
elsif data == ">"
|
||||
emitCurrentToken
|
||||
elsif data == "<"
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected < character when getting the tag name.")})
|
||||
emitCurrentToken
|
||||
elsif data == "/"
|
||||
processSolidusInTag
|
||||
@state = @states[:beforeAttributeName]
|
||||
else
|
||||
@currentToken[:name] += data
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def beforeAttributeNameState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
@stream.charsUntil(SPACE_CHARACTERS, true)
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file. Expected attribute name instead.")})
|
||||
emitCurrentToken
|
||||
elsif ASCII_LETTERS.include? data
|
||||
@currentToken[:data].push([data, ""])
|
||||
@state = @states[:attributeName]
|
||||
elsif data == ">"
|
||||
emitCurrentToken
|
||||
elsif data == "/"
|
||||
processSolidusInTag
|
||||
elsif data == "<"
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected < character. Expected attribute name instead.")})
|
||||
emitCurrentToken
|
||||
else
|
||||
@currentToken[:data].push([data, ""])
|
||||
@state = @states[:attributeName]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def attributeNameState
|
||||
data = @stream.char
|
||||
leavingThisState = true
|
||||
if data == "="
|
||||
@state = @states[:beforeAttributeValue]
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in attribute name.")})
|
||||
emitCurrentToken
|
||||
leavingThisState = false
|
||||
elsif ASCII_LETTERS.include? data
|
||||
@currentToken[:data][-1][0] += data +\
|
||||
@stream.charsUntil(ASCII_LETTERS, true)
|
||||
leavingThisState = false
|
||||
elsif data == ">"
|
||||
# XXX If we emit here the attributes are converted to a dict
|
||||
# without being checked and when the code below runs we error
|
||||
# because data is a dict not a list
|
||||
elsif SPACE_CHARACTERS.include? data
|
||||
@state = @states[:afterAttributeName]
|
||||
elsif data == "/"
|
||||
processSolidusInTag
|
||||
@state = @states[:beforeAttributeName]
|
||||
elsif data == "<"
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected < character in attribute name.")})
|
||||
emitCurrentToken
|
||||
leavingThisState = false
|
||||
else
|
||||
@currentToken[:data][-1][0] += data
|
||||
leavingThisState = false
|
||||
end
|
||||
|
||||
if leavingThisState
|
||||
# Attributes are not dropped at this stage. That happens when the
|
||||
# start tag token is emitted so values can still be safely appended
|
||||
# to attributes, but we do want to report the parse error in time.
|
||||
@currentToken[:data][0...-1].each {|name,value|
|
||||
if @currentToken[:data][-1][0] == name
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Dropped duplicate attribute on tag.")})
|
||||
end
|
||||
}
|
||||
# XXX Fix for above XXX
|
||||
if data == ">"
|
||||
emitCurrentToken
|
||||
end
|
||||
end
|
||||
return true
|
||||
end
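# A hedged illustration (editorial) of the duplicate-attribute check above:
# once an attribute name is complete it is compared against the earlier names
# on the same tag; a match only queues a ParseError here, and the duplicate
# value is actually dropped later, when the start tag token is emitted and
# its attribute list is converted to a hash.
#
#   "<p id=a id=b>" => ParseError "Dropped duplicate attribute on tag."
#                      followed by StartTag "p"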
|
||||
|
||||
def afterAttributeNameState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
@stream.charsUntil(SPACE_CHARACTERS, true)
|
||||
elsif data == "="
|
||||
@state = @states[:beforeAttributeValue]
|
||||
elsif data == ">"
|
||||
emitCurrentToken
|
||||
elsif ASCII_LETTERS.include? data
|
||||
@currentToken[:data].push([data, ""])
|
||||
@state = @states[:attributeName]
|
||||
elsif data == "/"
|
||||
processSolidusInTag
|
||||
@state = @states[:beforeAttributeName]
|
||||
elsif data == "<"
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected < character. Expected = or end of tag.")})
|
||||
emitCurrentToken
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file. Expected = or end of tag.")})
|
||||
emitCurrentToken
|
||||
else
|
||||
@currentToken[:data].push([data, ""])
|
||||
@state = @states[:attributeName]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def beforeAttributeValueState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
@stream.charsUntil(SPACE_CHARACTERS, true)
|
||||
elsif data == "\""
|
||||
@state = @states[:attributeValueDoubleQuoted]
|
||||
elsif data == "&"
|
||||
@state = @states[:attributeValueUnQuoted]
|
||||
@stream.queue.push(data);
|
||||
elsif data == "'"
|
||||
@state = @states[:attributeValueSingleQuoted]
|
||||
elsif data == ">"
|
||||
emitCurrentToken
|
||||
elsif data == "<"
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected < character. Expected attribute value.")})
|
||||
emitCurrentToken
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file. Expected attribute value.")})
|
||||
emitCurrentToken
|
||||
else
|
||||
@currentToken[:data][-1][1] += data
|
||||
@state = @states[:attributeValueUnQuoted]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def attributeValueDoubleQuotedState
|
||||
data = @stream.char
|
||||
if data == "\""
|
||||
@state = @states[:beforeAttributeName]
|
||||
elsif data == "&"
|
||||
processEntityInAttribute
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in attribute value (\").")})
|
||||
emitCurrentToken
|
||||
else
|
||||
@currentToken[:data][-1][1] += data +\
|
||||
@stream.charsUntil(["\"", "&"])
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def attributeValueSingleQuotedState
|
||||
data = @stream.char
|
||||
if data == "'"
|
||||
@state = @states[:beforeAttributeName]
|
||||
elsif data == "&"
|
||||
processEntityInAttribute
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in attribute value (').")})
|
||||
emitCurrentToken
|
||||
else
|
||||
@currentToken[:data][-1][1] += data +\
|
||||
@stream.charsUntil(["'", "&"])
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def attributeValueUnQuotedState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
@state = @states[:beforeAttributeName]
|
||||
elsif data == "&"
|
||||
processEntityInAttribute
|
||||
elsif data == ">"
|
||||
emitCurrentToken
|
||||
elsif data == "<"
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected < character in attribute value.")})
|
||||
emitCurrentToken
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in attribute value.")})
|
||||
emitCurrentToken
|
||||
else
|
||||
@currentToken[:data][-1][1] += data +
|
||||
@stream.charsUntil(["&", ">","<"] + SPACE_CHARACTERS)
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def bogusCommentState
|
||||
# Make a new comment token and give it as value all the characters
|
||||
# until the first > or :EOF (charsUntil checks for :EOF automatically)
|
||||
# and emit it.
|
||||
@tokenQueue.push(
|
||||
{:type => :Comment, :data => @stream.charsUntil((">"))})
|
||||
|
||||
# Eat the character directly after the bogus comment which is either a
|
||||
# ">" or an :EOF.
|
||||
@stream.char
|
||||
@state = @states[:data]
|
||||
return true
|
||||
end
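# A hedged example (editorial): processing instructions reach this state via
# tagOpenState, so everything up to the next ">" becomes a Comment token.
#
#   "<?xml version='1.0'?>" => Comment token with :data "?xml version='1.0'?"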
|
||||
|
||||
def markupDeclarationOpenState
|
||||
charStack = [@stream.char, @stream.char]
|
||||
if charStack == ["-", "-"]
|
||||
@currentToken = {:type => :Comment, :data => ""}
|
||||
@state = @states[:comment]
|
||||
else
|
||||
5.times { charStack.push(@stream.char) }
|
||||
# Put in explicit :EOF check
|
||||
if ((not charStack.include? :EOF) and
|
||||
charStack.join("").upcase == "DOCTYPE")
|
||||
@currentToken =\
|
||||
{:type => :Doctype, :name => "", :data => true}
|
||||
@state = @states[:doctype]
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected '--' or 'DOCTYPE'. Not found.")})
|
||||
@stream.queue += charStack
|
||||
@state = @states[:bogusComment]
|
||||
end
|
||||
end
|
||||
return true
|
||||
end
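# A hedged example (editorial) of the "<!" handling above, together with the
# comment and doctype states that follow it:
#
#   "<!-- hi -->"     => Comment token {:data => " hi "}
#   "<!DOCTYPE html>" => Doctype token {:name => "HTML", :data => false}
#
# Anything else after "<!" is reported and handled as a bogus comment.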
|
||||
|
||||
def commentState
|
||||
data = @stream.char
|
||||
if data == "-"
|
||||
@state = @states[:commentDash]
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in comment.")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
else
|
||||
@currentToken[:data] += data + @stream.charsUntil("-")
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def commentDashState
|
||||
data = @stream.char
|
||||
if data == "-"
|
||||
@state = @states[:commentEnd]
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in comment (-)")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
else
|
||||
@currentToken[:data] += "-" + data +\
|
||||
@stream.charsUntil("-")
|
||||
# Consume the next character which is either a "-" or an :EOF as
|
||||
# well so if there's a "-" directly after the "-" we go nicely to
|
||||
# the "comment end state" without emitting a ParseError there.
|
||||
@stream.char
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def commentEndState
|
||||
data = @stream.char
|
||||
if data == ">"
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
elsif data == "-"
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected '-' after '--' found in comment.")})
|
||||
@currentToken[:data] += data
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in comment (--).")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
else
|
||||
# XXX
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected character in comment found.")})
|
||||
@currentToken[:data] += "--" + data
|
||||
@state = @states[:comment]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def doctypeState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
@state = @states[:beforeDoctypeName]
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("No space after literal string 'DOCTYPE'.")})
|
||||
@stream.queue.push(data)
|
||||
@state = @states[:beforeDoctypeName]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def beforeDoctypeNameState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
elsif ASCII_LOWERCASE.include? data
|
||||
@currentToken[:name] = data.upcase
|
||||
@state = @states[:doctypeName]
|
||||
elsif data == ">"
|
||||
# Character needs to be consumed per the specification so don't
|
||||
# invoke emitCurrentTokenWithParseError with :data as argument.
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected > character. Expected DOCTYPE name.")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file. Expected DOCTYPE name.")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
else
|
||||
@currentToken[:name] = data
|
||||
@state = @states[:doctypeName]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def doctypeNameState
|
||||
data = @stream.char
|
||||
needsDoctypeCheck = false
|
||||
if SPACE_CHARACTERS.include? data
|
||||
@state = @states[:afterDoctypeName]
|
||||
needsDoctypeCheck = true
|
||||
elsif data == ">"
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in DOCTYPE name.")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
else
|
||||
# We can't just uppercase everything that arrives here. For
|
||||
# instance, non-ASCII characters.
|
||||
if ASCII_LOWERCASE.include? data
|
||||
data = data.upcase
|
||||
end
|
||||
@currentToken[:name] += data
|
||||
needsDoctypeCheck = true
|
||||
end
|
||||
|
||||
# After some iterations through this state it should eventually say
|
||||
# "HTML". Otherwise there's an error.
|
||||
if needsDoctypeCheck and @currentToken[:name] == "HTML"
|
||||
@currentToken[:data] = false
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def afterDoctypeNameState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
elsif data == ">"
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
elsif data == :EOF
|
||||
@currentToken[:data] = true
|
||||
# XXX EMIT
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected space or '>'. Got '" + data + "'")})
|
||||
@currentToken[:data] = true
|
||||
@state = @states[:bogusDoctype]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def bogusDoctypeState
|
||||
data = @stream.char
|
||||
if data == ">"
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
elsif data == :EOF
|
||||
# XXX EMIT
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in bogus doctype.")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def _(string); string; end
|
||||
end
|
||||
|
||||
end
|
21
vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb
vendored
Normal file
@ -0,0 +1,21 @@
module HTML5lib
  module TreeBuilders

    def self.getTreeBuilder(name)
      case name.to_s.downcase
      when 'simpletree' then
        require 'html5lib/treebuilders/simpletree'
        SimpleTree::TreeBuilder
      when 'rexml' then
        require 'html5lib/treebuilders/rexml'
        REXMLTree::TreeBuilder
      when 'hpricot' then
        require 'html5lib/treebuilders/hpricot'
        Hpricot::TreeBuilder
      else
        raise "Unknown TreeBuilder #{name}"
      end
    end

  end
end
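# A hedged usage sketch (editorial, not part of this file): choosing a tree
# builder by name and handing it to the parser. Only getTreeBuilder itself is
# defined here; the :tree option name used below is an assumption about the
# parser's interface.
#
#   require 'html5lib'
#   require 'html5lib/treebuilders'
#
#   builder = HTML5lib::TreeBuilders.getTreeBuilder('rexml')
#   doc     = HTML5lib.parse('<p>Hello', :tree => builder)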
330
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb
vendored
Executable file
@ -0,0 +1,330 @@
require 'html5lib/constants'
|
||||
|
||||
#XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
|
||||
|
||||
module HTML5lib
|
||||
|
||||
# The scope markers are inserted when entering buttons, object elements,
|
||||
# marquees, table cells, and table captions, and are used to prevent formatting
|
||||
# from "leaking" into tables, buttons, object elements, and marquees.
|
||||
Marker = nil
|
||||
|
||||
module TreeBuilders
|
||||
module Base
|
||||
|
||||
class Node
|
||||
# The parent of the current node (or nil for the document node)
|
||||
attr_accessor :parent
|
||||
|
||||
# a list of child nodes of the current node. This must
|
||||
# include all elements but not necessarily other node types
|
||||
attr_accessor :childNodes
|
||||
|
||||
# A list of miscellaneous flags that can be set on the node
|
||||
attr_accessor :_flags
|
||||
|
||||
def initialize(name)
|
||||
@parent = nil
|
||||
@childNodes = []
|
||||
@_flags = []
|
||||
end
|
||||
|
||||
# Insert node as a child of the current node
|
||||
def appendChild(node)
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
||||
# Insert data as text in the current node, positioned before the
|
||||
# start of node insertBefore or to the end of the node's text.
|
||||
def insertText(data, insertBefore = nil)
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
||||
# Insert node as a child of the current node, before refNode in the
|
||||
# list of child nodes. Raises ValueError if refNode is not a child of
|
||||
# the current node
|
||||
def insertBefore(node, refNode)
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
||||
# Remove node from the children of the current node
|
||||
def removeChild(node)
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
||||
# Move all the children of the current node to newParent.
|
||||
# This is needed so that trees that don't store text as nodes move the
|
||||
# text in the correct way
|
||||
def reparentChildren(newParent)
|
||||
#XXX - should this method be made more general?
|
||||
@childNodes.each { |child| newParent.appendChild(child) }
|
||||
@childNodes = []
|
||||
end
|
||||
|
||||
# Return a shallow copy of the current node i.e. a node with the same
|
||||
# name and attributes but with no parent or child nodes
|
||||
def cloneNode
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
||||
# Return true if the node has children or text, false otherwise
|
||||
def hasContent
|
||||
raise NotImplementedError
|
||||
end
|
||||
end
|
||||
|
||||
# Base treebuilder implementation
|
||||
class TreeBuilder
|
||||
|
||||
attr_accessor :openElements
|
||||
|
||||
attr_accessor :activeFormattingElements
|
||||
|
||||
attr_accessor :document
|
||||
|
||||
attr_accessor :headPointer
|
||||
|
||||
attr_accessor :formPointer
|
||||
|
||||
# Class to use for document root
|
||||
documentClass = nil
|
||||
|
||||
# Class to use for HTML elements
|
||||
elementClass = nil
|
||||
|
||||
# Class to use for comments
|
||||
commentClass = nil
|
||||
|
||||
# Class to use for doctypes
|
||||
doctypeClass = nil
|
||||
|
||||
# Fragment class
|
||||
fragmentClass = nil
|
||||
|
||||
def initialize
|
||||
reset
|
||||
end
|
||||
|
||||
def reset
|
||||
@openElements = []
|
||||
@activeFormattingElements = []
|
||||
|
||||
#XXX - rename these to headElement, formElement
|
||||
@headPointer = nil
|
||||
@formPointer = nil
|
||||
|
||||
self.insertFromTable = false
|
||||
|
||||
@document = @documentClass.new
|
||||
end
|
||||
|
||||
def elementInScope(target, tableVariant = false)
|
||||
# Exit early when possible.
|
||||
return true if @openElements[-1].name == target
|
||||
|
||||
# AT How about while true and simply set node to [-1] and set it to
|
||||
# [-2] at the end...
|
||||
@openElements.reverse.each do |element|
|
||||
if element.name == target
|
||||
return true
|
||||
elsif element.name == 'table'
|
||||
return false
|
||||
elsif not tableVariant and SCOPING_ELEMENTS.include?(element.name)
|
||||
return false
|
||||
elsif element.name == 'html'
|
||||
return false
|
||||
end
|
||||
end
|
||||
assert false # We should never reach this point
|
||||
end
|
||||
|
||||
def reconstructActiveFormattingElements
|
||||
# Within this algorithm the order of steps described in the
|
||||
# specification is not quite the same as the order of steps in the
|
||||
# code. It should still do the same though.
|
||||
|
||||
# Step 1: stop the algorithm when there's nothing to do.
|
||||
return unless @activeFormattingElements
|
||||
|
||||
# Step 2 and step 3: we start with the last element. So i is -1.
|
||||
i = -1
|
||||
entry = @activeFormattingElements[i]
|
||||
return if entry == Marker or @openElements.include?(entry)
|
||||
|
||||
# Step 6
|
||||
until entry == Marker or @openElements.include?(entry)
|
||||
# Step 5: let entry be one earlier in the list.
|
||||
i -= 1
|
||||
begin
|
||||
entry = @activeFormattingElements[i]
|
||||
rescue
|
||||
# Step 4: at this point we need to jump to step 8. By not doing
|
||||
# i += 1 which is also done in step 7 we achieve that.
|
||||
break
|
||||
end
|
||||
end
|
||||
while true
|
||||
# Step 7
|
||||
i += 1
|
||||
|
||||
# Step 8
|
||||
clone = @activeFormattingElements[i].cloneNode
|
||||
|
||||
# Step 9
|
||||
element = insertElement(clone.name, clone.attributes)
|
||||
|
||||
# Step 10
|
||||
@activeFormattingElements[i] = element
|
||||
|
||||
# Step 11
|
||||
break if element == @activeFormattingElements[-1]
|
||||
end
|
||||
end
|
||||
|
||||
def clearActiveFormattingElements
|
||||
{} until @activeFormattingElements.empty? || @activeFormattingElements.pop == Marker
|
||||
end
|
||||
|
||||
# Check if an element exists between the end of the active
|
||||
# formatting elements and the last marker. If it does, return it, else
|
||||
# return false
|
||||
def elementInActiveFormattingElements(name)
|
||||
@activeFormattingElements.reverse.each do |element|
|
||||
# Check for Marker first because if it's a Marker it doesn't have a
|
||||
# name attribute.
|
||||
break if element == Marker
|
||||
return element if element.name == name
|
||||
end
|
||||
return false
|
||||
end
|
||||
|
||||
def insertDoctype(name)
|
||||
@document.appendChild(@doctypeClass.new(name))
|
||||
end
|
||||
|
||||
def insertComment(data, parent = nil)
|
||||
parent = @openElements[-1] if parent.nil?
|
||||
parent.appendChild(@commentClass.new(data))
|
||||
end
|
||||
|
||||
# Create an element but don't insert it anywhere
|
||||
def createElement(name, attributes)
|
||||
element = @elementClass.new(name)
|
||||
element.attributes = attributes
|
||||
return element
|
||||
end
|
||||
|
||||
# Switch the function used to insert an element from the
|
||||
# normal one to the misnested table one and back again
|
||||
def insertFromTable=(value)
|
||||
@insertFromTable = value
|
||||
@insertElement = value ? :insertElementTable : :insertElementNormal
|
||||
end
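# A hedged note (editorial) on the setter above: it records the flag and swaps
# the symbol that insertElement dispatches on via send.
#
#   self.insertFromTable = true
#   insertElement('td', {})   # routed to insertElementTable
#   self.insertFromTable = false
#   insertElement('td', {})   # routed to insertElementNormal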
|
||||
|
||||
def insertElement(name, attributes)
|
||||
send(@insertElement, name, attributes)
|
||||
end
|
||||
|
||||
def insertElementNormal(name, attributes)
|
||||
element = @elementClass.new(name)
|
||||
element.attributes = attributes
|
||||
@openElements[-1].appendChild(element)
|
||||
@openElements.push(element)
|
||||
return element
|
||||
end
|
||||
|
||||
# Create an element and insert it into the tree
|
||||
def insertElementTable(name, attributes)
|
||||
element = @elementClass.new(name)
|
||||
element.attributes = attributes
|
||||
if TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)
|
||||
#We should be in the InTable mode. This means we want to do
|
||||
#special magic element rearranging
|
||||
parent, insertBefore = getTableMisnestedNodePosition
|
||||
if insertBefore.nil?
|
||||
parent.appendChild(element)
|
||||
else
|
||||
parent.insertBefore(element, insertBefore)
|
||||
end
|
||||
@openElements.push(element)
|
||||
else
|
||||
return insertElementNormal(name, attributes)
|
||||
end
|
||||
return element
|
||||
end
|
||||
|
||||
def insertText(data, parent = nil)
|
||||
parent = @openElements[-1] if parent.nil?
|
||||
|
||||
if (not(@insertFromTable) or (@insertFromTable and not TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)))
|
||||
parent.insertText(data)
|
||||
else
|
||||
#We should be in the InTable mode. This means we want to do
|
||||
#special magic element rearranging
|
||||
parent, insertBefore = getTableMisnestedNodePosition
|
||||
parent.insertText(data, insertBefore)
|
||||
end
|
||||
end
|
||||
|
||||
# Get the foster parent element, and sibling to insert before
|
||||
# (or nil) when inserting a misnested table node
|
||||
def getTableMisnestedNodePosition
|
||||
#The foster parent element is the one which comes before the most
|
||||
#recently opened table element
|
||||
#XXX - this is really inelegant
|
||||
lastTable = nil
|
||||
fosterParent = nil
|
||||
insertBefore = nil
|
||||
@openElements.reverse.each do |element|
|
||||
if element.name == "table"
|
||||
lastTable = element
|
||||
break
|
||||
end
|
||||
end
|
||||
if lastTable
|
||||
#XXX - we should really check that this parent is actually a
|
||||
#node here
|
||||
if lastTable.parent
|
||||
fosterParent = lastTable.parent
|
||||
insertBefore = lastTable
|
||||
else
|
||||
fosterParent = @openElements[@openElements.index(lastTable) - 1]
|
||||
end
|
||||
else
|
||||
fosterParent = @openElements[0]
|
||||
end
|
||||
return fosterParent, insertBefore
|
||||
end
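# A hedged illustration (editorial) of the foster-parenting rule above, in
# terms of the open-elements stack:
#
#   [html, body, table]              => parent = body, insert before the table
#   [html, table] (table unparented) => parent = html, appended at the end
#   [html] (no table at all)         => parent = html, appended at the end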
|
||||
|
||||
def generateImpliedEndTags(exclude = nil)
|
||||
name = @openElements[-1].name
|
||||
|
||||
if (['dd', 'dt', 'li', 'p', 'td', 'th', 'tr'].include?(name) and name != exclude)
|
||||
@openElements.pop
|
||||
# XXX This is not entirely what the specification says. We should
|
||||
# investigate it more closely.
|
||||
generateImpliedEndTags(exclude)
|
||||
end
|
||||
end
|
||||
|
||||
def getDocument
|
||||
@document
|
||||
end
|
||||
|
||||
def getFragment
|
||||
#assert @innerHTML
|
||||
fragment = @fragmentClass.new
|
||||
@openElements[0].reparentChildren(fragment)
|
||||
return fragment
|
||||
end
|
||||
|
||||
# Serialize the subtree of node in the format required by unit tests
|
||||
# node - the node from which to start serializing
|
||||
def testSerializer(node)
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
211
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb
vendored
Normal file
@ -0,0 +1,211 @@
require 'html5lib/treebuilders/base'
|
||||
require 'hpricot'
|
||||
require 'forwardable'
|
||||
|
||||
module HTML5lib
|
||||
module TreeBuilders
|
||||
module Hpricot
|
||||
|
||||
class Node < Base::Node
|
||||
|
||||
extend Forwardable
|
||||
|
||||
def_delegators :@hpricot, :name
|
||||
|
||||
attr_accessor :hpricot
|
||||
|
||||
def initialize(name)
|
||||
super(name)
|
||||
@hpricot = self.class.hpricot_class.new name
|
||||
end
|
||||
|
||||
def appendChild(node)
|
||||
if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
|
||||
childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s
|
||||
else
|
||||
childNodes << node
|
||||
hpricot.children << node.hpricot
|
||||
end
|
||||
node.parent = self
|
||||
end
|
||||
|
||||
def removeChild(node)
|
||||
childNodes.delete(node)
|
||||
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
|
||||
node.parent = nil
|
||||
end
|
||||
|
||||
def insertText(data, before = nil)
|
||||
if before
|
||||
insertBefore(TextNode.new(data), before)
|
||||
else
|
||||
appendChild(TextNode.new(data))
|
||||
end
|
||||
end
|
||||
|
||||
def insertBefore(node, refNode)
|
||||
index = childNodes.index(refNode)
|
||||
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
|
||||
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
|
||||
else
|
||||
childNodes.insert(index, node)
|
||||
end
|
||||
end
|
||||
|
||||
def hasContent
|
||||
childNodes.any?
|
||||
end
|
||||
end
|
||||
|
||||
class Element < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Elem
|
||||
end
|
||||
|
||||
def initialize(name)
|
||||
super(name)
|
||||
|
||||
@hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
|
||||
end
|
||||
|
||||
def name
|
||||
@hpricot.stag.name
|
||||
end
|
||||
|
||||
def cloneNode
|
||||
attributes.inject(self.class.new(name)) do |node, (name, value)|
|
||||
node.hpricot[name] = value
|
||||
node
|
||||
end
|
||||
end
|
||||
|
||||
# A call to Hpricot::Elem#raw_attributes is built dynamically,
|
||||
# so alterations to the returned value (a hash) will be lost.
|
||||
#
|
||||
# AttributeProxy works around this by forwarding :[]= calls
|
||||
# to the raw_attributes accessor on the element start tag.
|
||||
#
|
||||
class AttributeProxy
|
||||
def initialize(hpricot)
|
||||
@hpricot = hpricot
|
||||
end
|
||||
def []=(k, v)
|
||||
@hpricot.stag.send(stag_attributes_method)[k] = v
|
||||
end
|
||||
def stag_attributes_method
|
||||
# STag#attributes changed to STag#raw_attributes after Hpricot 0.5
|
||||
@hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
|
||||
end
|
||||
def method_missing(*a, &b)
|
||||
@hpricot.attributes.send(*a, &b)
|
||||
end
|
||||
end
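# A hedged usage sketch (editorial) of AttributeProxy: writes are forwarded to
# the start tag's attribute hash so they persist, while reads and everything
# else fall through to Hpricot's regular attributes via method_missing.
#
#   element = Element.new('span')
#   element.attributes['class'] = 'note'   # stored on the element's start tag
#   element.attributes['class']            # read back through @hpricot.attributes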
|
||||
|
||||
def attributes
|
||||
AttributeProxy.new(@hpricot)
|
||||
end
|
||||
|
||||
def attributes=(attrs)
|
||||
attrs.each { |name, value| @hpricot[name] = value }
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
tree = "\n|#{' ' * indent}<#{name}>"
|
||||
indent += 2
|
||||
attributes.each do |name, value|
|
||||
next if name == 'xmlns'
|
||||
tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
|
||||
end
|
||||
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
|
||||
end
|
||||
end
|
||||
|
||||
class Document < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Doc
|
||||
end
|
||||
|
||||
def initialize
|
||||
super(nil)
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentType < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::DocType
|
||||
end
|
||||
|
||||
def initialize(name)
|
||||
begin
|
||||
super(name)
|
||||
rescue ArgumentError # needs 3...
|
||||
end
|
||||
|
||||
@hpricot = ::Hpricot::DocType.new(name, nil, nil)
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentFragment < Element
|
||||
def initialize
|
||||
super('')
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
|
||||
end
|
||||
end
|
||||
|
||||
class TextNode < Node
|
||||
def initialize(data)
|
||||
@hpricot = ::Hpricot::Text.new(data)
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
"\n|#{' ' * indent}\"#{hpricot.content}\""
|
||||
end
|
||||
end
|
||||
|
||||
class CommentNode < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Comment
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
"\n|#{' ' * indent}<!-- #{hpricot.content} -->"
|
||||
end
|
||||
end
|
||||
|
||||
class TreeBuilder < Base::TreeBuilder
|
||||
def initialize
|
||||
@documentClass = Document
|
||||
@doctypeClass = DocumentType
|
||||
@elementClass = Element
|
||||
@commentClass = CommentNode
|
||||
@fragmentClass = DocumentFragment
|
||||
end
|
||||
|
||||
def testSerializer(node)
|
||||
node.printTree
|
||||
end
|
||||
|
||||
def getDocument
|
||||
@document.hpricot
|
||||
end
|
||||
|
||||
def getFragment
|
||||
@document = super
|
||||
return @document.hpricot.children
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
191
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb
vendored
Normal file
@ -0,0 +1,191 @@
require 'html5lib/treebuilders/base'
|
||||
require 'rexml/document'
|
||||
require 'forwardable'
|
||||
|
||||
module HTML5lib
|
||||
module TreeBuilders
|
||||
module REXMLTree
|
||||
|
||||
class Node < Base::Node
|
||||
extend Forwardable
|
||||
def_delegators :@rxobj, :name, :attributes
|
||||
attr_accessor :rxobj
|
||||
|
||||
def initialize name
|
||||
super name
|
||||
@rxobj = self.class.rxclass.new name
|
||||
end
|
||||
|
||||
def appendChild node
|
||||
if node.kind_of? TextNode and
|
||||
childNodes.length>0 and childNodes[-1].kind_of? TextNode
|
||||
childNodes[-1].rxobj.value =
|
||||
childNodes[-1].rxobj.to_s + node.rxobj.to_s
|
||||
childNodes[-1].rxobj.raw = true
|
||||
else
|
||||
childNodes.push node
|
||||
rxobj.add node.rxobj
|
||||
end
|
||||
node.parent = self
|
||||
end
|
||||
|
||||
def removeChild node
|
||||
childNodes.delete node
|
||||
rxobj.delete node.rxobj
|
||||
node.parent = nil
|
||||
end
|
||||
|
||||
def insertText data, before=nil
|
||||
if before
|
||||
insertBefore TextNode.new(data), before
|
||||
else
|
||||
appendChild TextNode.new(data)
|
||||
end
|
||||
end
|
||||
|
||||
def insertBefore node, refNode
|
||||
index = childNodes.index(refNode)
|
||||
if node.kind_of? TextNode and index>0 and
|
||||
childNodes[index-1].kind_of? TextNode
|
||||
childNodes[index-1].rxobj.value =
|
||||
childNodes[index-1].rxobj.to_s + node.rxobj.to_s
|
||||
childNodes[index-1].rxobj.raw = true
|
||||
else
|
||||
childNodes.insert index, node
|
||||
end
|
||||
end
|
||||
|
||||
def hasContent
|
||||
return (childNodes.length > 0)
|
||||
end
|
||||
end
|
||||
|
||||
class Element < Node
|
||||
def self.rxclass
|
||||
REXML::Element
|
||||
end
|
||||
|
||||
def initialize name
|
||||
super name
|
||||
end
|
||||
|
||||
def cloneNode
|
||||
newNode = self.class.new name
|
||||
attributes.each {|name,value| newNode.attributes[name] = value}
|
||||
newNode
|
||||
end
|
||||
|
||||
def attributes= value
|
||||
value.each {|name,value| rxobj.attributes[name]=value}
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
tree = "\n|#{' ' * indent}<#{name}>"
|
||||
indent += 2
|
||||
for name, value in attributes
|
||||
next if name == 'xmlns'
|
||||
tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
|
||||
end
|
||||
for child in childNodes
|
||||
tree += child.printTree(indent)
|
||||
end
|
||||
return tree
|
||||
end
|
||||
end
|
||||
|
||||
class Document < Node
|
||||
def self.rxclass
|
||||
REXML::Document
|
||||
end
|
||||
|
||||
def initialize
|
||||
super nil
|
||||
end
|
||||
|
||||
def appendChild node
|
||||
if node.kind_of? Element and node.name == 'html'
|
||||
node.rxobj.add_namespace('http://www.w3.org/1999/xhtml')
|
||||
end
|
||||
super node
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
tree = "#document"
|
||||
for child in childNodes
|
||||
tree += child.printTree(indent + 2)
|
||||
end
|
||||
return tree
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentType < Node
|
||||
def self.rxclass
|
||||
REXML::DocType
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
"\n|#{' ' * indent}<!DOCTYPE #{name}>"
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentFragment < Element
|
||||
def initialize
|
||||
super nil
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
tree = ""
|
||||
for child in childNodes
|
||||
tree += child.printTree(indent+2)
|
||||
end
|
||||
return tree
|
||||
end
|
||||
end
|
||||
|
||||
class TextNode < Node
|
||||
def initialize data
|
||||
raw = data.gsub('&', '&amp;').gsub('<', '&lt;').gsub('>', '&gt;')
|
||||
@rxobj = REXML::Text.new(raw, true, nil, true)
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
"\n|#{' ' * indent}\"#{rxobj.value}\""
|
||||
end
|
||||
end
|
||||
|
||||
class CommentNode < Node
|
||||
def self.rxclass
|
||||
REXML::Comment
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
"\n|#{' ' * indent}<!-- #{rxobj.string} -->"
|
||||
end
|
||||
end
|
||||
|
||||
class TreeBuilder < Base::TreeBuilder
|
||||
def initialize
|
||||
@documentClass = Document
|
||||
@doctypeClass = DocumentType
|
||||
@elementClass = Element
|
||||
@commentClass = CommentNode
|
||||
@fragmentClass = DocumentFragment
|
||||
end
|
||||
|
||||
def testSerializer node
|
||||
node.printTree()
|
||||
end
|
||||
|
||||
def getDocument
|
||||
@document.rxobj
|
||||
end
|
||||
|
||||
def getFragment
|
||||
@document = super
|
||||
return @document.rxobj.children
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
178
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/simpletree.rb
vendored
Normal file
@ -0,0 +1,178 @@
require 'html5lib/treebuilders/base'
|
||||
|
||||
module HTML5lib
|
||||
module TreeBuilders
|
||||
module SimpleTree
|
||||
|
||||
class Node < Base::Node
|
||||
# Node representing an item in the tree.
|
||||
# name - The tag name associated with the node
|
||||
attr_accessor :name
|
||||
|
||||
# The value of the current node (applies to text nodes and
|
||||
# comments)
|
||||
attr_accessor :value
|
||||
|
||||
# a dict holding name, value pairs for attributes of the node
|
||||
attr_accessor :attributes
|
||||
|
||||
def initialize name
|
||||
super
|
||||
@name = name
|
||||
@value = nil
|
||||
@attributes = {}
|
||||
end
|
||||
|
||||
def appendChild node
|
||||
if node.kind_of? TextNode and
|
||||
childNodes.length>0 and childNodes[-1].kind_of? TextNode
|
||||
childNodes[-1].value += node.value
|
||||
else
|
||||
childNodes.push node
|
||||
end
|
||||
node.parent = self
|
||||
end
|
||||
|
||||
def removeChild node
|
||||
childNodes.delete node
|
||||
node.parent = nil
|
||||
end
|
||||
|
||||
def cloneNode
|
||||
newNode = self.class.new name
|
||||
attributes.each {|name,value| newNode.attributes[name] = value}
|
||||
newNode.value = value
|
||||
newNode
|
||||
end
|
||||
|
||||
def insertText data, before=nil
|
||||
if before
|
||||
insertBefore TextNode.new(data), before
|
||||
else
|
||||
appendChild TextNode.new(data)
|
||||
end
|
||||
end
|
||||
|
||||
def insertBefore node, refNode
|
||||
index = childNodes.index(refNode)
|
||||
if node.kind_of? TextNode and index>0 and
|
||||
childNodes[index-1].kind_of? TextNode
|
||||
childNodes[index-1].value += node.value
|
||||
else
|
||||
childNodes.insert index, node
|
||||
end
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
tree = "\n|%s%s" % [' '* indent, self.to_s]
|
||||
for child in childNodes
|
||||
tree += child.printTree(indent + 2)
|
||||
end
|
||||
return tree
|
||||
end
|
||||
|
||||
def hasContent
|
||||
return (childNodes.length > 0)
|
||||
end
|
||||
end
|
||||
|
||||
class Element < Node
|
||||
def to_s
|
||||
"<%s>" % name
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
tree = "\n|%s%s" % [' '* indent, self.to_s]
|
||||
indent += 2
|
||||
for name, value in attributes
|
||||
tree += "\n|%s%s=\"%s\"" % [' ' * indent, name, value]
|
||||
end
|
||||
for child in childNodes
|
||||
tree += child.printTree(indent)
|
||||
end
|
||||
return tree
|
||||
end
|
||||
end
|
||||
|
||||
class Document < Node
|
||||
def to_s
|
||||
"#document"
|
||||
end
|
||||
|
||||
def initialize
|
||||
super nil
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
tree = to_s
|
||||
for child in childNodes
|
||||
tree += child.printTree(indent + 2)
|
||||
end
|
||||
return tree
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentType < Node
|
||||
def to_s
|
||||
"<!DOCTYPE %s>" % name
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentFragment < Element
|
||||
def initialize
|
||||
super nil
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
tree = ""
|
||||
for child in childNodes
|
||||
tree += child.printTree(indent+2)
|
||||
end
|
||||
return tree
|
||||
end
|
||||
end
|
||||
|
||||
class TextNode < Node
|
||||
def initialize value
|
||||
super nil
|
||||
@value = value
|
||||
end
|
||||
|
||||
def to_s
|
||||
'"%s"' % value
|
||||
end
|
||||
end
|
||||
|
||||
class CommentNode < Node
|
||||
def initialize value
|
||||
super nil
|
||||
@value = value
|
||||
end
|
||||
|
||||
def to_s
|
||||
"<!-- %s -->" % value
|
||||
end
|
||||
end
|
||||
|
||||
class TreeBuilder < Base::TreeBuilder
|
||||
def initialize
|
||||
@documentClass = Document
|
||||
@doctypeClass = DocumentType
|
||||
@elementClass = Element
|
||||
@commentClass = CommentNode
|
||||
@fragmentClass = DocumentFragment
|
||||
end
|
||||
|
||||
def testSerializer node
|
||||
node.printTree()
|
||||
end
|
||||
|
||||
def getFragment
|
||||
@document = super
|
||||
return @document.childNodes
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
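# A hedged sketch (editorial) of the test-serializer output produced by the
# printTree/to_s methods above: one "|"-prefixed line per node, indented two
# extra spaces per level, with attributes and text values quoted.
#
#   #document
#   |  <html>
#   |    <body>
#   |      <p>
#   |        id="x"
#   |        "Hello"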
|