Sync with latest HTML5lib and latest Maruku

This commit is contained in:
Jacques Distler 2007-07-04 17:36:59 -05:00
parent 8e92e4a3ab
commit 8ccaad85a5
71 changed files with 1974 additions and 1621 deletions

817
vendor/plugins/HTML5lib/lib/html5/constants.rb vendored Executable file
View file

@ -0,0 +1,817 @@
module HTML5
class EOF < Exception; end
CONTENT_MODEL_FLAGS = [
:PCDATA,
:RCDATA,
:CDATA,
:PLAINTEXT
]
SCOPING_ELEMENTS = %w[
button
caption
html
marquee
object
table
td
th
]
FORMATTING_ELEMENTS = %w[
a
b
big
em
font
i
nobr
s
small
strike
strong
tt
u
]
SPECIAL_ELEMENTS = %w[
address
area
base
basefont
bgsound
blockquote
body
br
center
col
colgroup
dd
dir
div
dl
dt
embed
fieldset
form
frame
frameset
h1
h2
h3
h4
h5
h6
head
hr
iframe
image
img
input
isindex
li
link
listing
menu
meta
noembed
noframes
noscript
ol
optgroup
option
p
param
plaintext
pre
script
select
spacer
style
tbody
textarea
tfoot
thead
title
tr
ul
wbr
]
SPACE_CHARACTERS = %W[
\t
\n
\x0B
\x0C
\x20
\r
]
TABLE_INSERT_MODE_ELEMENTS = %w[
table
tbody
tfoot
thead
tr
]
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
DIGITS = '0'..'9'
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
# Heading elements need to be ordered
HEADING_ELEMENTS = %w[
h1
h2
h3
h4
h5
h6
]
# XXX What about event-source and command?
VOID_ELEMENTS = %w[
base
link
meta
hr
br
img
embed
param
area
col
input
]
CDATA_ELEMENTS = %w[title textarea]
RCDATA_ELEMENTS = %w[
style
script
xmp
iframe
noembed
noframes
noscript
]
BOOLEAN_ATTRIBUTES = {
:global => %w[irrelevant],
'style' => %w[scoped],
'img' => %w[ismap],
'audio' => %w[autoplay controls],
'video' => %w[autoplay controls],
'script' => %w[defer async],
'details' => %w[open],
'datagrid' => %w[multiple disabled],
'command' => %w[hidden disabled checked default],
'menu' => %w[autosubmit],
'fieldset' => %w[disabled readonly],
'option' => %w[disabled readonly selected],
'optgroup' => %w[disabled readonly],
'button' => %w[disabled autofocus],
'input' => %w[disabled readonly required autofocus checked ismap],
'select' => %w[disabled readonly autofocus multiple],
'output' => %w[disabled readonly]
}
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN
65533, # 0x81 UNDEFINED
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
8224, # 0x86 0x2020 DAGGER
8225, # 0x87 0x2021 DOUBLE DAGGER
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
8240, # 0x89 0x2030 PER MILLE SIGN
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
65533, # 0x8D UNDEFINED
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
65533, # 0x8F UNDEFINED
65533, # 0x90 UNDEFINED
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
8226, # 0x95 0x2022 BULLET
8211, # 0x96 0x2013 EN DASH
8212, # 0x97 0x2014 EM DASH
732, # 0x98 0x02DC SMALL TILDE
8482, # 0x99 0x2122 TRADE MARK SIGN
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
65533, # 0x9D UNDEFINED
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
]
# ENTITIES was generated from Python using the following code:
#
# import constants
# entities = constants.entities.items()
# entities.sort()
# list = [ ' '.join([repr(entity), '=>', ord(value)<128 and
# repr(str(value)) or repr(value.encode('utf-8')).replace("'",'"')])
# for entity, value in entities]
# print ' ENTITIES = {\n ' + ',\n '.join(list) + '\n }'
ENTITIES = {
'AElig' => "\xc3\x86",
'AElig;' => "\xc3\x86",
'AMP' => '&',
'AMP;' => '&',
'Aacute' => "\xc3\x81",
'Aacute;' => "\xc3\x81",
'Acirc' => "\xc3\x82",
'Acirc;' => "\xc3\x82",
'Agrave' => "\xc3\x80",
'Agrave;' => "\xc3\x80",
'Alpha;' => "\xce\x91",
'Aring' => "\xc3\x85",
'Aring;' => "\xc3\x85",
'Atilde' => "\xc3\x83",
'Atilde;' => "\xc3\x83",
'Auml' => "\xc3\x84",
'Auml;' => "\xc3\x84",
'Beta;' => "\xce\x92",
'COPY' => "\xc2\xa9",
'COPY;' => "\xc2\xa9",
'Ccedil' => "\xc3\x87",
'Ccedil;' => "\xc3\x87",
'Chi;' => "\xce\xa7",
'Dagger;' => "\xe2\x80\xa1",
'Delta;' => "\xce\x94",
'ETH' => "\xc3\x90",
'ETH;' => "\xc3\x90",
'Eacute' => "\xc3\x89",
'Eacute;' => "\xc3\x89",
'Ecirc' => "\xc3\x8a",
'Ecirc;' => "\xc3\x8a",
'Egrave' => "\xc3\x88",
'Egrave;' => "\xc3\x88",
'Epsilon;' => "\xce\x95",
'Eta;' => "\xce\x97",
'Euml' => "\xc3\x8b",
'Euml;' => "\xc3\x8b",
'GT' => '>',
'GT;' => '>',
'Gamma;' => "\xce\x93",
'Iacute' => "\xc3\x8d",
'Iacute;' => "\xc3\x8d",
'Icirc' => "\xc3\x8e",
'Icirc;' => "\xc3\x8e",
'Igrave' => "\xc3\x8c",
'Igrave;' => "\xc3\x8c",
'Iota;' => "\xce\x99",
'Iuml' => "\xc3\x8f",
'Iuml;' => "\xc3\x8f",
'Kappa;' => "\xce\x9a",
'LT' => '<',
'LT;' => '<',
'Lambda;' => "\xce\x9b",
'Mu;' => "\xce\x9c",
'Ntilde' => "\xc3\x91",
'Ntilde;' => "\xc3\x91",
'Nu;' => "\xce\x9d",
'OElig;' => "\xc5\x92",
'Oacute' => "\xc3\x93",
'Oacute;' => "\xc3\x93",
'Ocirc' => "\xc3\x94",
'Ocirc;' => "\xc3\x94",
'Ograve' => "\xc3\x92",
'Ograve;' => "\xc3\x92",
'Omega;' => "\xce\xa9",
'Omicron;' => "\xce\x9f",
'Oslash' => "\xc3\x98",
'Oslash;' => "\xc3\x98",
'Otilde' => "\xc3\x95",
'Otilde;' => "\xc3\x95",
'Ouml' => "\xc3\x96",
'Ouml;' => "\xc3\x96",
'Phi;' => "\xce\xa6",
'Pi;' => "\xce\xa0",
'Prime;' => "\xe2\x80\xb3",
'Psi;' => "\xce\xa8",
'QUOT' => '"',
'QUOT;' => '"',
'REG' => "\xc2\xae",
'REG;' => "\xc2\xae",
'Rho;' => "\xce\xa1",
'Scaron;' => "\xc5\xa0",
'Sigma;' => "\xce\xa3",
'THORN' => "\xc3\x9e",
'THORN;' => "\xc3\x9e",
'TRADE;' => "\xe2\x84\xa2",
'Tau;' => "\xce\xa4",
'Theta;' => "\xce\x98",
'Uacute' => "\xc3\x9a",
'Uacute;' => "\xc3\x9a",
'Ucirc' => "\xc3\x9b",
'Ucirc;' => "\xc3\x9b",
'Ugrave' => "\xc3\x99",
'Ugrave;' => "\xc3\x99",
'Upsilon;' => "\xce\xa5",
'Uuml' => "\xc3\x9c",
'Uuml;' => "\xc3\x9c",
'Xi;' => "\xce\x9e",
'Yacute' => "\xc3\x9d",
'Yacute;' => "\xc3\x9d",
'Yuml;' => "\xc5\xb8",
'Zeta;' => "\xce\x96",
'aacute' => "\xc3\xa1",
'aacute;' => "\xc3\xa1",
'acirc' => "\xc3\xa2",
'acirc;' => "\xc3\xa2",
'acute' => "\xc2\xb4",
'acute;' => "\xc2\xb4",
'aelig' => "\xc3\xa6",
'aelig;' => "\xc3\xa6",
'agrave' => "\xc3\xa0",
'agrave;' => "\xc3\xa0",
'alefsym;' => "\xe2\x84\xb5",
'alpha;' => "\xce\xb1",
'amp' => '&',
'amp;' => '&',
'and;' => "\xe2\x88\xa7",
'ang;' => "\xe2\x88\xa0",
'apos;' => "'",
'aring' => "\xc3\xa5",
'aring;' => "\xc3\xa5",
'asymp;' => "\xe2\x89\x88",
'atilde' => "\xc3\xa3",
'atilde;' => "\xc3\xa3",
'auml' => "\xc3\xa4",
'auml;' => "\xc3\xa4",
'bdquo;' => "\xe2\x80\x9e",
'beta;' => "\xce\xb2",
'brvbar' => "\xc2\xa6",
'brvbar;' => "\xc2\xa6",
'bull;' => "\xe2\x80\xa2",
'cap;' => "\xe2\x88\xa9",
'ccedil' => "\xc3\xa7",
'ccedil;' => "\xc3\xa7",
'cedil' => "\xc2\xb8",
'cedil;' => "\xc2\xb8",
'cent' => "\xc2\xa2",
'cent;' => "\xc2\xa2",
'chi;' => "\xcf\x87",
'circ;' => "\xcb\x86",
'clubs;' => "\xe2\x99\xa3",
'cong;' => "\xe2\x89\x85",
'copy' => "\xc2\xa9",
'copy;' => "\xc2\xa9",
'crarr;' => "\xe2\x86\xb5",
'cup;' => "\xe2\x88\xaa",
'curren' => "\xc2\xa4",
'curren;' => "\xc2\xa4",
'dArr;' => "\xe2\x87\x93",
'dagger;' => "\xe2\x80\xa0",
'darr;' => "\xe2\x86\x93",
'deg' => "\xc2\xb0",
'deg;' => "\xc2\xb0",
'delta;' => "\xce\xb4",
'diams;' => "\xe2\x99\xa6",
'divide' => "\xc3\xb7",
'divide;' => "\xc3\xb7",
'eacute' => "\xc3\xa9",
'eacute;' => "\xc3\xa9",
'ecirc' => "\xc3\xaa",
'ecirc;' => "\xc3\xaa",
'egrave' => "\xc3\xa8",
'egrave;' => "\xc3\xa8",
'empty;' => "\xe2\x88\x85",
'emsp;' => "\xe2\x80\x83",
'ensp;' => "\xe2\x80\x82",
'epsilon;' => "\xce\xb5",
'equiv;' => "\xe2\x89\xa1",
'eta;' => "\xce\xb7",
'eth' => "\xc3\xb0",
'eth;' => "\xc3\xb0",
'euml' => "\xc3\xab",
'euml;' => "\xc3\xab",
'euro;' => "\xe2\x82\xac",
'exist;' => "\xe2\x88\x83",
'fnof;' => "\xc6\x92",
'forall;' => "\xe2\x88\x80",
'frac12' => "\xc2\xbd",
'frac12;' => "\xc2\xbd",
'frac14' => "\xc2\xbc",
'frac14;' => "\xc2\xbc",
'frac34' => "\xc2\xbe",
'frac34;' => "\xc2\xbe",
'frasl;' => "\xe2\x81\x84",
'gamma;' => "\xce\xb3",
'ge;' => "\xe2\x89\xa5",
'gt' => '>',
'gt;' => '>',
'hArr;' => "\xe2\x87\x94",
'harr;' => "\xe2\x86\x94",
'hearts;' => "\xe2\x99\xa5",
'hellip;' => "\xe2\x80\xa6",
'iacute' => "\xc3\xad",
'iacute;' => "\xc3\xad",
'icirc' => "\xc3\xae",
'icirc;' => "\xc3\xae",
'iexcl' => "\xc2\xa1",
'iexcl;' => "\xc2\xa1",
'igrave' => "\xc3\xac",
'igrave;' => "\xc3\xac",
'image;' => "\xe2\x84\x91",
'infin;' => "\xe2\x88\x9e",
'int;' => "\xe2\x88\xab",
'iota;' => "\xce\xb9",
'iquest' => "\xc2\xbf",
'iquest;' => "\xc2\xbf",
'isin;' => "\xe2\x88\x88",
'iuml' => "\xc3\xaf",
'iuml;' => "\xc3\xaf",
'kappa;' => "\xce\xba",
'lArr;' => "\xe2\x87\x90",
'lambda;' => "\xce\xbb",
'lang;' => "\xe3\x80\x88",
'laquo' => "\xc2\xab",
'laquo;' => "\xc2\xab",
'larr;' => "\xe2\x86\x90",
'lceil;' => "\xe2\x8c\x88",
'ldquo;' => "\xe2\x80\x9c",
'le;' => "\xe2\x89\xa4",
'lfloor;' => "\xe2\x8c\x8a",
'lowast;' => "\xe2\x88\x97",
'loz;' => "\xe2\x97\x8a",
'lrm;' => "\xe2\x80\x8e",
'lsaquo;' => "\xe2\x80\xb9",
'lsquo;' => "\xe2\x80\x98",
'lt' => '<',
'lt;' => '<',
'macr' => "\xc2\xaf",
'macr;' => "\xc2\xaf",
'mdash;' => "\xe2\x80\x94",
'micro' => "\xc2\xb5",
'micro;' => "\xc2\xb5",
'middot' => "\xc2\xb7",
'middot;' => "\xc2\xb7",
'minus;' => "\xe2\x88\x92",
'mu;' => "\xce\xbc",
'nabla;' => "\xe2\x88\x87",
'nbsp' => "\xc2\xa0",
'nbsp;' => "\xc2\xa0",
'ndash;' => "\xe2\x80\x93",
'ne;' => "\xe2\x89\xa0",
'ni;' => "\xe2\x88\x8b",
'not' => "\xc2\xac",
'not;' => "\xc2\xac",
'notin;' => "\xe2\x88\x89",
'nsub;' => "\xe2\x8a\x84",
'ntilde' => "\xc3\xb1",
'ntilde;' => "\xc3\xb1",
'nu;' => "\xce\xbd",
'oacute' => "\xc3\xb3",
'oacute;' => "\xc3\xb3",
'ocirc' => "\xc3\xb4",
'ocirc;' => "\xc3\xb4",
'oelig;' => "\xc5\x93",
'ograve' => "\xc3\xb2",
'ograve;' => "\xc3\xb2",
'oline;' => "\xe2\x80\xbe",
'omega;' => "\xcf\x89",
'omicron;' => "\xce\xbf",
'oplus;' => "\xe2\x8a\x95",
'or;' => "\xe2\x88\xa8",
'ordf' => "\xc2\xaa",
'ordf;' => "\xc2\xaa",
'ordm' => "\xc2\xba",
'ordm;' => "\xc2\xba",
'oslash' => "\xc3\xb8",
'oslash;' => "\xc3\xb8",
'otilde' => "\xc3\xb5",
'otilde;' => "\xc3\xb5",
'otimes;' => "\xe2\x8a\x97",
'ouml' => "\xc3\xb6",
'ouml;' => "\xc3\xb6",
'para' => "\xc2\xb6",
'para;' => "\xc2\xb6",
'part;' => "\xe2\x88\x82",
'permil;' => "\xe2\x80\xb0",
'perp;' => "\xe2\x8a\xa5",
'phi;' => "\xcf\x86",
'pi;' => "\xcf\x80",
'piv;' => "\xcf\x96",
'plusmn' => "\xc2\xb1",
'plusmn;' => "\xc2\xb1",
'pound' => "\xc2\xa3",
'pound;' => "\xc2\xa3",
'prime;' => "\xe2\x80\xb2",
'prod;' => "\xe2\x88\x8f",
'prop;' => "\xe2\x88\x9d",
'psi;' => "\xcf\x88",
'quot' => '"',
'quot;' => '"',
'rArr;' => "\xe2\x87\x92",
'radic;' => "\xe2\x88\x9a",
'rang;' => "\xe3\x80\x89",
'raquo' => "\xc2\xbb",
'raquo;' => "\xc2\xbb",
'rarr;' => "\xe2\x86\x92",
'rceil;' => "\xe2\x8c\x89",
'rdquo;' => "\xe2\x80\x9d",
'real;' => "\xe2\x84\x9c",
'reg' => "\xc2\xae",
'reg;' => "\xc2\xae",
'rfloor;' => "\xe2\x8c\x8b",
'rho;' => "\xcf\x81",
'rlm;' => "\xe2\x80\x8f",
'rsaquo;' => "\xe2\x80\xba",
'rsquo;' => "\xe2\x80\x99",
'sbquo;' => "\xe2\x80\x9a",
'scaron;' => "\xc5\xa1",
'sdot;' => "\xe2\x8b\x85",
'sect' => "\xc2\xa7",
'sect;' => "\xc2\xa7",
'shy' => "\xc2\xad",
'shy;' => "\xc2\xad",
'sigma;' => "\xcf\x83",
'sigmaf;' => "\xcf\x82",
'sim;' => "\xe2\x88\xbc",
'spades;' => "\xe2\x99\xa0",
'sub;' => "\xe2\x8a\x82",
'sube;' => "\xe2\x8a\x86",
'sum;' => "\xe2\x88\x91",
'sup1' => "\xc2\xb9",
'sup1;' => "\xc2\xb9",
'sup2' => "\xc2\xb2",
'sup2;' => "\xc2\xb2",
'sup3' => "\xc2\xb3",
'sup3;' => "\xc2\xb3",
'sup;' => "\xe2\x8a\x83",
'supe;' => "\xe2\x8a\x87",
'szlig' => "\xc3\x9f",
'szlig;' => "\xc3\x9f",
'tau;' => "\xcf\x84",
'there4;' => "\xe2\x88\xb4",
'theta;' => "\xce\xb8",
'thetasym;' => "\xcf\x91",
'thinsp;' => "\xe2\x80\x89",
'thorn' => "\xc3\xbe",
'thorn;' => "\xc3\xbe",
'tilde;' => "\xcb\x9c",
'times' => "\xc3\x97",
'times;' => "\xc3\x97",
'trade;' => "\xe2\x84\xa2",
'uArr;' => "\xe2\x87\x91",
'uacute' => "\xc3\xba",
'uacute;' => "\xc3\xba",
'uarr;' => "\xe2\x86\x91",
'ucirc' => "\xc3\xbb",
'ucirc;' => "\xc3\xbb",
'ugrave' => "\xc3\xb9",
'ugrave;' => "\xc3\xb9",
'uml' => "\xc2\xa8",
'uml;' => "\xc2\xa8",
'upsih;' => "\xcf\x92",
'upsilon;' => "\xcf\x85",
'uuml' => "\xc3\xbc",
'uuml;' => "\xc3\xbc",
'weierp;' => "\xe2\x84\x98",
'xi;' => "\xce\xbe",
'yacute' => "\xc3\xbd",
'yacute;' => "\xc3\xbd",
'yen' => "\xc2\xa5",
'yen;' => "\xc2\xa5",
'yuml' => "\xc3\xbf",
'yuml;' => "\xc3\xbf",
'zeta;' => "\xce\xb6",
'zwj;' => "\xe2\x80\x8d",
'zwnj;' => "\xe2\x80\x8c"
}
ENCODINGS = %w[
ansi_x3.4-1968
iso-ir-6
ansi_x3.4-1986
iso_646.irv:1991
ascii
iso646-us
us-ascii
us
ibm367
cp367
csascii
ks_c_5601-1987
korean
iso-2022-kr
csiso2022kr
euc-kr
iso-2022-jp
csiso2022jp
iso-2022-jp-2
iso-ir-58
chinese
csiso58gb231280
iso_8859-1:1987
iso-ir-100
iso_8859-1
iso-8859-1
latin1
l1
ibm819
cp819
csisolatin1
iso_8859-2:1987
iso-ir-101
iso_8859-2
iso-8859-2
latin2
l2
csisolatin2
iso_8859-3:1988
iso-ir-109
iso_8859-3
iso-8859-3
latin3
l3
csisolatin3
iso_8859-4:1988
iso-ir-110
iso_8859-4
iso-8859-4
latin4
l4
csisolatin4
iso_8859-6:1987
iso-ir-127
iso_8859-6
iso-8859-6
ecma-114
asmo-708
arabic
csisolatinarabic
iso_8859-7:1987
iso-ir-126
iso_8859-7
iso-8859-7
elot_928
ecma-118
greek
greek8
csisolatingreek
iso_8859-8:1988
iso-ir-138
iso_8859-8
iso-8859-8
hebrew
csisolatinhebrew
iso_8859-5:1988
iso-ir-144
iso_8859-5
iso-8859-5
cyrillic
csisolatincyrillic
iso_8859-9:1989
iso-ir-148
iso_8859-9
iso-8859-9
latin5
l5
csisolatin5
iso-8859-10
iso-ir-157
l6
iso_8859-10:1992
csisolatin6
latin6
hp-roman8
roman8
r8
ibm037
cp037
csibm037
ibm424
cp424
csibm424
ibm437
cp437
437
cspc8codepage437
ibm500
cp500
csibm500
ibm775
cp775
cspc775baltic
ibm850
cp850
850
cspc850multilingual
ibm852
cp852
852
cspcp852
ibm855
cp855
855
csibm855
ibm857
cp857
857
csibm857
ibm860
cp860
860
csibm860
ibm861
cp861
861
cp-is
csibm861
ibm862
cp862
862
cspc862latinhebrew
ibm863
cp863
863
csibm863
ibm864
cp864
csibm864
ibm865
cp865
865
csibm865
ibm866
cp866
866
csibm866
ibm869
cp869
869
cp-gr
csibm869
ibm1026
cp1026
csibm1026
koi8-r
cskoi8r
koi8-u
big5-hkscs
ptcp154
csptcp154
pt154
cp154
utf-7
utf-16be
utf-16le
utf-16
utf-8
iso-8859-13
iso-8859-14
iso-ir-199
iso_8859-14:1998
iso_8859-14
latin8
iso-celtic
l8
iso-8859-15
iso_8859-15
iso-8859-16
iso-ir-226
iso_8859-16:2001
iso_8859-16
latin10
l10
gbk
cp936
ms936
gb18030
shift_jis
ms_kanji
csshiftjis
euc-jp
gb2312
big5
csbig5
windows-1250
windows-1251
windows-1252
windows-1253
windows-1254
windows-1255
windows-1256
windows-1257
windows-1258
tis-620
hz-gb-2312
]
end

View file

@ -0,0 +1 @@
require 'html5/filters/optionaltags'

View file

@ -0,0 +1,10 @@
require 'delegate'
require 'enumerator'
module HTML5
module Filters
class Base < SimpleDelegator
include Enumerable
end
end
end

View file

@ -0,0 +1,85 @@
require 'html5/filters/base'
module HTML5
module Filters
class InjectMetaCharset < Base
def initialize(source, encoding)
super(source)
@encoding = encoding
end
def each
state = :pre_head
meta_found = @encoding.nil?
pending = []
__getobj__.each do |token|
case token[:type]
when :StartTag
state = :in_head if token[:name].downcase == "head"
when :EmptyTag
if token[:name].downcase == "meta"
# replace charset with actual encoding
token[:data].each_with_index do |(name,value),index|
if name == 'charset'
token[:data][index][1]=@encoding
meta_found = true
end
end
# replace charset with actual encoding
has_http_equiv_content_type = false
content_index = -1
token[:data].each_with_index do |(name,value),i|
if name.downcase == 'charset'
token[:data][i] = ['charset', @encoding]
meta_found = true
break
elsif name == 'http-equiv' and value.downcase == 'content-type'
has_http_equiv_content_type = true
elsif name == 'content'
content_index = i
end
end
if not meta_found
if has_http_equiv_content_type and content_index >= 0
token[:data][content_index][1] =
'text/html; charset=%s' % @encoding
meta_found = true
end
end
elsif token[:name].downcase == "head" and not meta_found
# insert meta into empty head
yield(:type => :StartTag, :name => "head", :data => token[:data])
yield(:type => :EmptyTag, :name => "meta",
:data => [["charset", @encoding]])
yield(:type => :EndTag, :name => "head")
meta_found = true
next
end
when :EndTag
if token[:name].downcase == "head" and pending.any?
# insert meta into head (if necessary) and flush pending queue
yield pending.shift
yield(:type => :EmptyTag, :name => "meta",
:data => [["charset", @encoding]]) if not meta_found
yield pending.shift while pending.any?
meta_found = true
state = :post_head
end
end
if state == :in_head
pending << token
else
yield token
end
end
end
end
end
end

View file

@ -0,0 +1,199 @@
require 'html5/constants'
require 'html5/filters/base'
module HTML5
module Filters
class OptionalTagFilter < Base
def slider
previous1 = previous2 = nil
__getobj__.each do |token|
yield previous2, previous1, token if previous1 != nil
previous2 = previous1
previous1 = token
end
yield previous2, previous1, nil
end
def each
slider do |previous, token, nexttok|
type = token[:type]
if type == :StartTag
yield token unless token[:data].empty? and is_optional_start(token[:name], previous, nexttok)
elsif type == :EndTag
yield token unless is_optional_end(token[:name], nexttok)
else
yield token
end
end
end
def is_optional_start(tagname, previous, nexttok)
type = nexttok ? nexttok[:type] : nil
if tagname == 'html'
# An html element's start tag may be omitted if the first thing
# inside the html element is not a space character or a comment.
return ![:Comment, :SpaceCharacters].include?(type)
elsif tagname == 'head'
# A head element's start tag may be omitted if the first thing
# inside the head element is an element.
return type == :StartTag
elsif tagname == 'body'
# A body element's start tag may be omitted if the first thing
# inside the body element is not a space character or a comment,
# except if the first thing inside the body element is a script
# or style element and the node immediately preceding the body
# element is a head element whose end tag has been omitted.
if [:Comment, :SpaceCharacters].include?(type)
return false
elsif type == :StartTag
# XXX: we do not look at the preceding event, so we never omit
# the body element's start tag if it's followed by a script or
# a style element.
return !%w[script style].include?(nexttok[:name])
else
return true
end
elsif tagname == 'colgroup'
# A colgroup element's start tag may be omitted if the first thing
# inside the colgroup element is a col element, and if the element
# is not immediately preceeded by another colgroup element whose
# end tag has been omitted.
if type == :StartTag
# XXX: we do not look at the preceding event, so instead we never
# omit the colgroup element's end tag when it is immediately
# followed by another colgroup element. See is_optional_end.
return nexttok[:name] == "col"
else
return false
end
elsif tagname == 'tbody'
# A tbody element's start tag may be omitted if the first thing
# inside the tbody element is a tr element, and if the element is
# not immediately preceeded by a tbody, thead, or tfoot element
# whose end tag has been omitted.
if type == :StartTag
# omit the thead and tfoot elements' end tag when they are
# immediately followed by a tbody element. See is_optional_end.
if previous and previous[:type] == :EndTag and \
%w(tbody thead tfoot).include?(previous[:name])
return false
end
return nexttok[:name] == 'tr'
else
return false
end
end
return false
end
def is_optional_end(tagname, nexttok)
type = nexttok ? nexttok[:type] : nil
if %w[html head body].include?(tagname)
# An html element's end tag may be omitted if the html element
# is not immediately followed by a space character or a comment.
return ![:Comment, :SpaceCharacters].include?(type)
elsif %w[li optgroup option tr].include?(tagname)
# A li element's end tag may be omitted if the li element is
# immediately followed by another li element or if there is
# no more content in the parent element.
# An optgroup element's end tag may be omitted if the optgroup
# element is immediately followed by another optgroup element,
# or if there is no more content in the parent element.
# An option element's end tag may be omitted if the option
# element is immediately followed by another option element,
# or if there is no more content in the parent element.
# A tr element's end tag may be omitted if the tr element is
# immediately followed by another tr element, or if there is
# no more content in the parent element.
if type == :StartTag
return nexttok[:name] == tagname
else
return type == :EndTag || type == nil
end
elsif %w(dt dd).include?(tagname)
# A dt element's end tag may be omitted if the dt element is
# immediately followed by another dt element or a dd element.
# A dd element's end tag may be omitted if the dd element is
# immediately followed by another dd element or a dt element,
# or if there is no more content in the parent element.
if type == :StartTag
return %w(dt dd).include?(nexttok[:name])
elsif tagname == 'dd'
return type == :EndTag || type == nil
else
return false
end
elsif tagname == 'p'
# A p element's end tag may be omitted if the p element is
# immediately followed by an address, blockquote, dl, fieldset,
# form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
# or ul element, or if there is no more content in the parent
# element.
if type == :StartTag
return %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
h6 hr menu ol p pre table ul).include?(nexttok[:name])
else
return type == :EndTag || type == nil
end
elsif tagname == 'colgroup'
# A colgroup element's end tag may be omitted if the colgroup
# element is not immediately followed by a space character or
# a comment.
if [:Comment, :SpaceCharacters].include?(type)
return false
elsif type == :StartTag
# XXX: we also look for an immediately following colgroup
# element. See is_optional_start.
return nexttok[:name] != 'colgroup'
else
return true
end
elsif %w(thead tbody).include? tagname
# A thead element's end tag may be omitted if the thead element
# is immediately followed by a tbody or tfoot element.
# A tbody element's end tag may be omitted if the tbody element
# is immediately followed by a tbody or tfoot element, or if
# there is no more content in the parent element.
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == :StartTag
return %w(tbody tfoot).include?(nexttok[:name])
elsif tagname == 'tbody'
return (type == :EndTag or type == nil)
else
return false
end
elsif tagname == 'tfoot'
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == :StartTag
return nexttok[:name] == 'tbody'
else
return type == :EndTag || type == nil
end
elsif %w(td th).include? tagname
# A td element's end tag may be omitted if the td element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
# A th element's end tag may be omitted if the th element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
if type == :StartTag
return %w(td th).include?(nexttok[:name])
else
return type == :EndTag || type == nil
end
end
return false
end
end
end
end

View file

@ -0,0 +1,15 @@
require 'html5/filters/base'
require 'html5/sanitizer'
module HTML5
module Filters
class HTMLSanitizeFilter < Base
include HTMLSanitizeModule
def each
__getobj__.each do |token|
yield(sanitize_token(token))
end
end
end
end
end

View file

@ -0,0 +1,36 @@
require 'html5/constants'
require 'html5/filters/base'
module HTML5
module Filters
class WhitespaceFilter < Base
SPACE_PRESERVE_ELEMENTS = %w[pre textarea] + RCDATA_ELEMENTS
SPACES = /[#{SPACE_CHARACTERS.join('')}]+/m
def each
preserve = 0
__getobj__.each do |token|
case token[:type]
when :StartTag
if preserve > 0 or SPACE_PRESERVE_ELEMENTS.include?(token[:name])
preserve += 1
end
when :EndTag
preserve -= 1 if preserve > 0
when :SpaceCharacters
next if preserve == 0
when :Characters
token[:data] = token[:data].sub(SPACES,' ') if preserve == 0
end
yield token
end
end
end
end
end

View file

@ -0,0 +1,246 @@
require 'html5/constants'
require 'html5/tokenizer'
require 'html5/treebuilders/rexml'
Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
require 'html5/html5parser/' + File.basename(path)
end
module HTML5
# Error in parsed document
class ParseError < Exception; end
class AssertionError < Exception; end
# HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
#
class HTMLParser
attr_accessor :phase, :firstStartTag, :innerHTML, :lastPhase, :insertFromTable
attr_reader :phases, :tokenizer, :tree, :errors
def self.parse(stream, options = {})
encoding = options.delete(:encoding)
new(options).parse(stream,encoding)
end
def self.parseFragment(stream, options = {})
container = options.delete(:container) || 'div'
encoding = options.delete(:encoding)
new(options).parseFragment(stream,container,encoding)
end
@@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption
inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )
# :strict - raise an exception when a parse error is encountered
# :tree - a treebuilder class controlling the type of tree that will be
# returned. Built in treebuilders can be accessed through
# HTML5::TreeBuilders[treeType]
def initialize(options = {})
@strict = false
@errors = []
@tokenizer = HTMLTokenizer
@tree = TreeBuilders::REXML::TreeBuilder
options.each { |name, value| instance_variable_set("@#{name}", value) }
@tree = @tree.new
@phases = @@phases.inject({}) do |phases, phase_name|
phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
phases
end
end
def _parse(stream, innerHTML, encoding, container = 'div')
@tree.reset
@firstStartTag = false
@errors = []
@tokenizer = @tokenizer.class unless Class === @tokenizer
@tokenizer = @tokenizer.new(stream, :encoding => encoding,
:parseMeta => !innerHTML)
if innerHTML
case @innerHTML = container.downcase
when 'title', 'textarea'
@tokenizer.contentModelFlag = :RCDATA
when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
@tokenizer.contentModelFlag = :CDATA
when 'plaintext'
@tokenizer.contentModelFlag = :PLAINTEXT
else
# contentModelFlag already is PCDATA
#@tokenizer.contentModelFlag = :PCDATA
end
@phase = @phases[:rootElement]
@phase.insertHtmlElement
resetInsertionMode
else
@innerHTML = false
@phase = @phases[:initial]
end
# We only seem to have InBodyPhase testcases where the following is
# relevant ... need others too
@lastPhase = nil
# XXX This is temporary for the moment so there isn't any other
# changes needed for the parser to work with the iterable tokenizer
@tokenizer.each do |token|
token = normalizeToken(token)
method = 'process%s' % token[:type]
case token[:type]
when :Characters, :SpaceCharacters, :Comment
@phase.send method, token[:data]
when :StartTag
@phase.send method, token[:name], token[:data]
when :EndTag
@phase.send method, token[:name]
when :Doctype
@phase.send method, token[:name], token[:publicId],
token[:systemId], token[:correct]
else
parseError(token[:data])
end
end
# When the loop finishes it's EOF
@phase.processEOF
end
# Parse a HTML document into a well-formed tree
#
# stream - a filelike object or string containing the HTML to be parsed
#
# The optional encoding parameter must be a string that indicates
# the encoding. If specified, that encoding will be used,
# regardless of any BOM or later declaration (such as in a meta
# element)
def parse(stream, encoding=nil)
_parse(stream, false, encoding)
return @tree.getDocument
end
# Parse a HTML fragment into a well-formed tree fragment
# container - name of the element we're setting the innerHTML property
# if set to nil, default to 'div'
#
# stream - a filelike object or string containing the HTML to be parsed
#
# The optional encoding parameter must be a string that indicates
# the encoding. If specified, that encoding will be used,
# regardless of any BOM or later declaration (such as in a meta
# element)
def parseFragment(stream, container='div', encoding=nil)
_parse(stream, true, encoding, container)
return @tree.getFragment
end
def parseError(data = 'XXX ERROR MESSAGE NEEDED')
# XXX The idea is to make data mandatory.
@errors.push([@tokenizer.stream.position, data])
raise ParseError if @strict
end
# HTML5 specific normalizations to the token stream
def normalizeToken(token)
if token[:type] == :EmptyTag
# When a solidus (/) is encountered within a tag name what happens
# depends on whether the current tag name matches that of a void
# element. If it matches a void element atheists did the wrong
# thing and if it doesn't it's wrong for everyone.
unless VOID_ELEMENTS.include?(token[:name])
parseError(_('Solidus (/) incorrectly placed in tag.'))
end
token[:type] = :StartTag
end
if token[:type] == :StartTag
token[:name] = token[:name].tr(ASCII_UPPERCASE,ASCII_LOWERCASE)
# We need to remove the duplicate attributes and convert attributes
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
unless token[:data].empty?
data = token[:data].reverse.map { |attr, value| [attr.tr(ASCII_UPPERCASE, ASCII_LOWERCASE), value] }
token[:data] = Hash[*data.flatten]
end
elsif token[:type] == :EndTag
parseError(_('End tag contains unexpected attributes.')) unless token[:data].empty?
token[:name] = token[:name].downcase
end
return token
end
@@new_modes = {
'select' => :inSelect,
'td' => :inCell,
'th' => :inCell,
'tr' => :inRow,
'tbody' => :inTableBody,
'thead' => :inTableBody,
'tfoot' => :inTableBody,
'caption' => :inCaption,
'colgroup' => :inColumnGroup,
'table' => :inTable,
'head' => :inBody,
'body' => :inBody,
'frameset' => :inFrameset
}
def resetInsertionMode
# The name of this method is mostly historical. (It's also used in the
# specification.)
last = false
@tree.openElements.reverse.each do |node|
nodeName = node.name
if node == @tree.openElements[0]
last = true
unless ['td', 'th'].include?(nodeName)
# XXX
# assert @innerHTML
nodeName = @innerHTML
end
end
# Check for conditions that should only happen in the innerHTML
# case
if ['select', 'colgroup', 'head', 'frameset'].include?(nodeName)
# XXX
# assert @innerHTML
end
if @@new_modes.has_key?(nodeName)
@phase = @phases[@@new_modes[nodeName]]
elsif nodeName == 'html'
@phase = @phases[@tree.headPointer.nil?? :beforeHead : :afterHead]
elsif last
@phase = @phases[:inBody]
else
next
end
break
end
end
def _(string); string; end
end
end

View file

@ -0,0 +1,46 @@
require 'html5/html5parser/phase'
module HTML5
class AfterBodyPhase < Phase
handle_end 'html'
def processComment(data)
# This is needed because data is to be appended to the <html> element
# here and not to whatever is currently open.
@tree.insertComment(data, @tree.openElements[0])
end
def processCharacters(data)
@parser.parseError(_('Unexpected non-space characters in the after body phase.'))
@parser.phase = @parser.phases[:inBody]
@parser.phase.processCharacters(data)
end
def processStartTag(name, attributes)
@parser.parseError(_("Unexpected start tag token (#{name}) in the after body phase."))
@parser.phase = @parser.phases[:inBody]
@parser.phase.processStartTag(name, attributes)
end
def endTagHtml(name)
if @parser.innerHTML
@parser.parseError
else
# XXX: This may need to be done, not sure
# Don't set lastPhase to the current phase but to the inBody phase
# instead. No need for extra parse errors if there's something after </html>.
# Try "<!doctype html>X</html>X" for instance.
@parser.lastPhase = @parser.phase
@parser.phase = @parser.phases[:trailingEnd]
end
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag token (#{name}) in the after body phase."))
@parser.phase = @parser.phases[:inBody]
@parser.phase.processEndTag(name)
end
end
end

View file

@ -0,0 +1,34 @@
require 'html5/html5parser/phase'
module HTML5
class AfterFramesetPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#after3
handle_start 'html', 'noframes'
handle_end 'html'
def processCharacters(data)
@parser.parseError(_('Unexpected non-space characters in the after frameset phase. Ignored.'))
end
def startTagNoframes(name, attributes)
@parser.phases[:inBody].processStartTag(name, attributes)
end
def startTagOther(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) in the after frameset phase. Ignored."))
end
def endTagHtml(name)
@parser.lastPhase = @parser.phase
@parser.phase = @parser.phases[:trailingEnd]
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}) in the after frameset phase. Ignored."))
end
end
end

View file

@ -0,0 +1,50 @@
require 'html5/html5parser/phase'
module HTML5
class AfterHeadPhase < Phase
handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'
def processEOF
anythingElse
@parser.phase.processEOF
end
def processCharacters(data)
anythingElse
@parser.phase.processCharacters(data)
end
def startTagBody(name, attributes)
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inBody]
end
def startTagFrameset(name, attributes)
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inFrameset]
end
def startTagFromHead(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) that can be in head. Moved."))
@parser.phase = @parser.phases[:inHead]
@parser.phase.processStartTag(name, attributes)
end
def startTagOther(name, attributes)
anythingElse
@parser.phase.processStartTag(name, attributes)
end
def processEndTag(name)
anythingElse
@parser.phase.processEndTag(name)
end
def anythingElse
@tree.insertElement('body', {})
@parser.phase = @parser.phases[:inBody]
end
end
end

View file

@ -0,0 +1,41 @@
require 'html5/html5parser/phase'
module HTML5
class BeforeHeadPhase < Phase
handle_start 'html', 'head'
handle_end %w( html head body br p ) => 'ImplyHead'
def processEOF
startTagHead('head', {})
@parser.phase.processEOF
end
def processCharacters(data)
startTagHead('head', {})
@parser.phase.processCharacters(data)
end
def startTagHead(name, attributes)
@tree.insertElement(name, attributes)
@tree.headPointer = @tree.openElements[-1]
@parser.phase = @parser.phases[:inHead]
end
def startTagOther(name, attributes)
startTagHead('head', {})
@parser.phase.processStartTag(name, attributes)
end
def endTagImplyHead(name)
startTagHead('head', {})
@parser.phase.processEndTag(name)
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}) after the (implied) root element."))
end
end
end

View file

@ -0,0 +1,592 @@
require 'html5/html5parser/phase'
module HTML5
class InBodyPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
handle_start 'html'
handle_start %w( base link meta script style ) => 'ProcessInHead'
handle_start 'title'
handle_start 'body', 'form', 'plaintext', 'a', 'button', 'xmp', 'table', 'hr', 'image'
handle_start 'input', 'textarea', 'select', 'isindex', %w( marquee object )
handle_start %w( li dd dt ) => 'ListItem'
handle_start %w( address blockquote center dir div dl fieldset listing menu ol p pre ul ) => 'CloseP'
handle_start %w( b big em font i s small strike strong tt u ) => 'Formatting'
handle_start 'nobr'
handle_start %w( area basefont bgsound br embed img param spacer wbr ) => 'VoidFormatting'
handle_start %w( iframe noembed noframes noscript ) => 'Cdata', HEADING_ELEMENTS => 'Heading'
handle_start %w( caption col colgroup frame frameset head option optgroup tbody td tfoot th thead tr ) => 'Misplaced'
handle_start %w( event-source section nav article aside header footer datagrid command ) => 'New'
handle_end 'p', 'body', 'html', 'form', %w( button marquee object ), %w( dd dt li ) => 'ListItem'
handle_end %w( address blockquote center div dl fieldset listing menu ol pre ul ) => 'Block'
handle_end HEADING_ELEMENTS => 'Heading'
handle_end %w( a b big em font i nobr s small strike strong tt u ) => 'Formatting'
handle_end %w( head frameset select optgroup option table caption colgroup col thead tfoot tbody tr td th ) => 'Misplaced'
handle_end 'br'
handle_end %w( area basefont bgsound embed hr image img input isindex param spacer wbr frame ) => 'None'
handle_end %w( noframes noscript noembed textarea xmp iframe ) => 'CdataTextAreaXmp'
handle_end %w( event-source section nav article aside header footer datagrid command ) => 'New'
def initialize(parser, tree)
super(parser, tree)
# for special handling of whitespace in <pre>
@processSpaceCharactersDropNewline = false
end
def processSpaceCharactersDropNewline(data)
#Sometimes (start of <pre> blocks) we want to drop leading newlines
@processSpaceCharactersDropNewline = false
if (data.length > 0 and data[0] == ?\n and
%w[pre textarea].include?(@tree.openElements[-1].name) and
not @tree.openElements[-1].hasContent)
data = data[1..-1]
end
@tree.insertText(data) if data.length > 0
end
def processSpaceCharacters(data)
if @processSpaceCharactersDropNewline
processSpaceCharactersDropNewline(data)
else
super(data)
end
end
def processCharacters(data)
# XXX The specification says to do this for every character at the
# moment, but apparently that doesn't match the real world so we don't
# do it for space characters.
@tree.reconstructActiveFormattingElements
@tree.insertText(data)
end
def startTagProcessInHead(name, attributes)
@parser.phases[:inHead].processStartTag(name, attributes)
end
def startTagTitle(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) that belongs in the head. Moved."))
@parser.phases[:inHead].processStartTag(name, attributes)
end
def startTagBody(name, attributes)
@parser.parseError(_('Unexpected start tag (body).'))
if (@tree.openElements.length == 1 or
@tree.openElements[1].name != 'body')
assert @parser.innerHTML
else
attributes.each do |attr, value|
unless @tree.openElements[1].attributes.has_key?(attr)
@tree.openElements[1].attributes[attr] = value
end
end
end
end
def startTagCloseP(name, attributes)
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@processSpaceCharactersDropNewline = true if name == 'pre'
end
def startTagForm(name, attributes)
if @tree.formPointer
@parser.parseError(_('Unexpected start tag (form). Ignored.'))
else
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@tree.formPointer = @tree.openElements[-1]
end
end
def startTagListItem(name, attributes)
endTagP('p') if in_scope?('p')
stopNames = {'li' => ['li'], 'dd' => ['dd', 'dt'], 'dt' => ['dd', 'dt']}
stopName = stopNames[name]
@tree.openElements.reverse.each_with_index do |node, i|
if stopName.include?(node.name)
poppedNodes = (0..i).collect { @tree.openElements.pop }
if i >= 1
@parser.parseError(_("Missing end tag%s (%s)" % [
(i>1 ? 's' : ''),
poppedNodes.reverse.map {|item| item.name}.join(', ')]))
end
break
end
# Phrasing elements are all non special, non scoping, non
# formatting elements
break if ((SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name) and
not ['address', 'div'].include?(node.name))
end
# Always insert an <li> element.
@tree.insertElement(name, attributes)
end
def startTagPlaintext(name, attributes)
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :PLAINTEXT
end
def startTagHeading(name, attributes)
endTagP('p') if in_scope?('p')
# Uncomment the following for IE7 behavior:
# HEADING_ELEMENTS.each do |element|
# if in_scope?(element)
# @parser.parseError(_("Unexpected start tag (#{name})."))
#
# remove_open_elements_until do |element|
# HEADING_ELEMENTS.include?(element.name)
# end
#
# break
# end
# end
@tree.insertElement(name, attributes)
end
def startTagA(name, attributes)
if afeAElement = @tree.elementInActiveFormattingElements('a')
@parser.parseError(_('Unexpected start tag (a) implies end tag (a).'))
endTagFormatting('a')
@tree.openElements.delete(afeAElement) if @tree.openElements.include?(afeAElement)
@tree.activeFormattingElements.delete(afeAElement) if @tree.activeFormattingElements.include?(afeAElement)
end
@tree.reconstructActiveFormattingElements
addFormattingElement(name, attributes)
end
def startTagFormatting(name, attributes)
@tree.reconstructActiveFormattingElements
addFormattingElement(name, attributes)
end
def startTagNobr(name, attributes)
@tree.reconstructActiveFormattingElements
processEndTag('nobr') if in_scope?('nobr')
addFormattingElement(name, attributes)
end
def startTagButton(name, attributes)
if in_scope?('button')
@parser.parseError(_('Unexpected start tag (button) implied end tag (button).'))
processEndTag('button')
@parser.phase.processStartTag(name, attributes)
else
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@tree.activeFormattingElements.push(Marker)
end
end
def startTagMarqueeObject(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@tree.activeFormattingElements.push(Marker)
end
def startTagXmp(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :CDATA
end
def startTagTable(name, attributes)
processEndTag('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inTable]
end
def startTagVoidFormatting(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@tree.openElements.pop
end
def startTagHr(name, attributes)
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@tree.openElements.pop
end
def startTagImage(name, attributes)
# No really...
@parser.parseError(_('Unexpected start tag (image). Treated as img.'))
processStartTag('img', attributes)
end
def startTagInput(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
if @tree.formPointer
# XXX Not exactly sure what to do here
# @tree.openElements[-1].form = @tree.formPointer
end
@tree.openElements.pop
end
def startTagIsindex(name, attributes)
@parser.parseError(_("Unexpected start tag isindex. Don't use it!"))
return if @tree.formPointer
processStartTag('form', {})
processStartTag('hr', {})
processStartTag('p', {})
processStartTag('label', {})
# XXX Localization ...
processCharacters('This is a searchable index. Insert your search keywords here:')
attributes['name'] = 'isindex'
attrs = attributes.to_a
processStartTag('input', attributes)
processEndTag('label')
processEndTag('p')
processStartTag('hr', {})
processEndTag('form')
end
def startTagTextarea(name, attributes)
# XXX Form element pointer checking here as well...
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :RCDATA
@processSpaceCharactersDropNewline = true
end
# iframe, noembed noframes, noscript(if scripting enabled)
def startTagCdata(name, attributes)
@tree.insertElement(name, attributes)
@parser.tokenizer.contentModelFlag = :CDATA
end
def startTagSelect(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inSelect]
end
def startTagMisplaced(name, attributes)
# Elements that should be children of other elements that have a
# different insertion mode; here they are ignored
# "caption", "col", "colgroup", "frame", "frameset", "head",
# "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
# "tr", "noscript"
@parser.parseError(_("Unexpected start tag (#{name}). Ignored."))
end
def startTagNew(name, attributes)
# New HTML5 elements, "event-source", "section", "nav",
# "article", "aside", "header", "footer", "datagrid", "command"
sys.stderr.write("Warning: Undefined behaviour for start tag #{name}")
startTagOther(name, attributes)
#raise NotImplementedError
end
def startTagOther(name, attributes)
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, attributes)
end
def endTagP(name)
@tree.generateImpliedEndTags('p') if in_scope?('p')
@parser.parseError(_('Unexpected end tag (p).')) unless @tree.openElements[-1].name == 'p'
if in_scope?('p')
@tree.openElements.pop while in_scope?('p')
else
startTagCloseP('p', {})
endTagP('p')
end
end
def endTagBody(name)
# XXX Need to take open <p> tags into account here. We shouldn't imply
# </p> but we should not throw a parse error either. Specification is
# likely to be updated.
unless @tree.openElements[1].name == 'body'
# innerHTML case
@parser.parseError
return
end
unless @tree.openElements[-1].name == 'body'
@parser.parseError(_("Unexpected end tag (body). Missing end tag (#{@tree.openElements[-1].name})."))
end
@parser.phase = @parser.phases[:afterBody]
end
def endTagHtml(name)
endTagBody(name)
@parser.phase.processEndTag(name) unless @parser.innerHTML
end
def endTagBlock(name)
#Put us back in the right whitespace handling mode
@processSpaceCharactersDropNewline = false if name == 'pre'
@tree.generateImpliedEndTags if in_scope?(name)
unless @tree.openElements[-1].name == name
@parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
end
if in_scope?(name)
remove_open_elements_until(name)
end
end
def endTagForm(name)
if in_scope?(name)
@tree.generateImpliedEndTags
end
if @tree.openElements[-1].name != name
@parser.parseError(_("End tag (form) seen too early. Ignored."))
else
@tree.openElements.pop
end
@tree.formPointer = nil
end
def endTagListItem(name)
# AT Could merge this with the Block case
if in_scope?(name)
@tree.generateImpliedEndTags(name)
unless @tree.openElements[-1].name == name
@parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
end
end
remove_open_elements_until(name) if in_scope?(name)
end
def endTagHeading(name)
HEADING_ELEMENTS.each do |element|
if in_scope?(element)
@tree.generateImpliedEndTags
break
end
end
unless @tree.openElements[-1].name == name
@parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag."))
end
HEADING_ELEMENTS.each do |element|
if in_scope?(element)
remove_open_elements_until { |element| HEADING_ELEMENTS.include?(element.name) }
break
end
end
end
# The much-feared adoption agency algorithm
def endTagFormatting(name)
# http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
# XXX Better parseError messages appreciated.
while true
# Step 1 paragraph 1
afeElement = @tree.elementInActiveFormattingElements(name)
if not afeElement or (@tree.openElements.include?(afeElement) and not in_scope?(afeElement.name))
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 1 of the adoption agency algorithm."))
return
# Step 1 paragraph 2
elsif not @tree.openElements.include?(afeElement)
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 2 of the adoption agency algorithm."))
@tree.activeFormattingElements.delete(afeElement)
return
end
# Step 1 paragraph 3
if afeElement != @tree.openElements[-1]
@parser.parseError(_("End tag (#{name}) violates step 1, paragraph 3 of the adoption agency algorithm."))
end
# Step 2
# Start of the adoption agency algorithm proper
afeIndex = @tree.openElements.index(afeElement)
furthestBlock = nil
@tree.openElements[afeIndex..-1].each do |element|
if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(element.name)
furthestBlock = element
break
end
end
# Step 3
if furthestBlock.nil?
element = remove_open_elements_until { |element| element == afeElement }
@tree.activeFormattingElements.delete(element)
return
end
commonAncestor = @tree.openElements[afeIndex - 1]
# Step 5
furthestBlock.parent.removeChild(furthestBlock) if furthestBlock.parent
# Step 6
# The bookmark is supposed to help us identify where to reinsert
# nodes in step 12. We have to ensure that we reinsert nodes after
# the node before the active formatting element. Note the bookmark
# can move in step 7.4
bookmark = @tree.activeFormattingElements.index(afeElement)
# Step 7
lastNode = node = furthestBlock
while true
# AT replace this with a function and recursion?
# Node is element before node in open elements
node = @tree.openElements[@tree.openElements.index(node) - 1]
until @tree.activeFormattingElements.include?(node)
tmpNode = node
node = @tree.openElements[@tree.openElements.index(node) - 1]
@tree.openElements.delete(tmpNode)
end
# Step 7.3
break if node == afeElement
# Step 7.4
if lastNode == furthestBlock
# XXX should this be index(node) or index(node)+1
# Anne: I think +1 is ok. Given x = [2,3,4,5]
# x.index(3) gives 1 and then x[1 +1] gives 4...
bookmark = @tree.activeFormattingElements.index(node) + 1
end
# Step 7.5
cite = node.parent
if node.hasContent
clone = node.cloneNode
# Replace node with clone
@tree.activeFormattingElements[@tree.activeFormattingElements.index(node)] = clone
@tree.openElements[@tree.openElements.index(node)] = clone
node = clone
end
# Step 7.6
# Remove lastNode from its parents, if any
lastNode.parent.removeChild(lastNode) if lastNode.parent
node.appendChild(lastNode)
# Step 7.7
lastNode = node
# End of inner loop
end
# Step 8
lastNode.parent.removeChild(lastNode) if lastNode.parent
commonAncestor.appendChild(lastNode)
# Step 9
clone = afeElement.cloneNode
# Step 10
furthestBlock.reparentChildren(clone)
# Step 11
furthestBlock.appendChild(clone)
# Step 12
@tree.activeFormattingElements.delete(afeElement)
@tree.activeFormattingElements.insert([bookmark,@tree.activeFormattingElements.length].min, clone)
# Step 13
@tree.openElements.delete(afeElement)
@tree.openElements.insert(@tree.openElements.index(furthestBlock) + 1, clone)
end
end
def endTagButtonMarqueeObject(name)
@tree.generateImpliedEndTags if in_scope?(name)
unless @tree.openElements[-1].name == name
@parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag first."))
end
if in_scope?(name)
remove_open_elements_until(name)
@tree.clearActiveFormattingElements
end
end
def endTagMisplaced(name)
# This handles elements with end tags in other insertion modes.
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
def endTagBr(name)
@parser.parseError(_("Unexpected end tag (br). Treated as br element."))
@tree.reconstructActiveFormattingElements
@tree.insertElement(name, {})
@tree.openElements.pop()
end
def endTagNone(name)
# This handles elements with no end tag.
@parser.parseError(_("This tag (#{name}) has no end tag"))
end
def endTagCdataTextAreaXmp(name)
if @tree.openElements[-1].name == name
@tree.openElements.pop
else
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
end
def endTagNew(name)
# New HTML5 elements, "event-source", "section", "nav",
# "article", "aside", "header", "footer", "datagrid", "command"
STDERR.puts "Warning: Undefined behaviour for end tag #{name}"
endTagOther(name)
#raise NotImplementedError
end
def endTagOther(name)
# XXX This logic should be moved into the treebuilder
@tree.openElements.reverse.each do |node|
if node.name == name
@tree.generateImpliedEndTags
unless @tree.openElements[-1].name == name
@parser.parseError(_("Unexpected end tag (#{name})."))
end
remove_open_elements_until { |element| element == node }
break
else
if (SPECIAL_ELEMENTS + SCOPING_ELEMENTS).include?(node.name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
break
end
end
end
end
protected
def addFormattingElement(name, attributes)
@tree.insertElement(name, attributes)
@tree.activeFormattingElements.push(@tree.openElements[-1])
end
end
end

View file

@ -0,0 +1,68 @@
require 'html5/html5parser/phase'
module HTML5
class InCaptionPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption
handle_start 'html', %w( caption col colgroup tbody td tfoot th thead tr ) => 'TableElement'
handle_end 'caption', 'table', %w( body col colgroup html tbody td tfoot th thead tr ) => 'Ignore'
def ignoreEndTagCaption
not in_scope?('caption', true)
end
def processCharacters(data)
@parser.phases[:inBody].processCharacters(data)
end
def startTagTableElement(name, attributes)
@parser.parseError
#XXX Have to duplicate logic here to find out if the tag is ignored
ignoreEndTag = ignoreEndTagCaption
@parser.phase.processEndTag('caption')
@parser.phase.processStartTag(name, attributes) unless ignoreEndTag
end
def startTagOther(name, attributes)
@parser.phases[:inBody].processStartTag(name, attributes)
end
def endTagCaption(name)
if ignoreEndTagCaption
# innerHTML case
assert @parser.innerHTML
@parser.parseError
else
# AT this code is quite similar to endTagTable in "InTable"
@tree.generateImpliedEndTags
unless @tree.openElements[-1].name == 'caption'
@parser.parseError(_("Unexpected end tag (caption). Missing end tags."))
end
remove_open_elements_until('caption')
@tree.clearActiveFormattingElements
@parser.phase = @parser.phases[:inTable]
end
end
def endTagTable(name)
@parser.parseError
ignoreEndTag = ignoreEndTagCaption
@parser.phase.processEndTag('caption')
@parser.phase.processEndTag(name) unless ignoreEndTag
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
def endTagOther(name)
@parser.phases[:inBody].processEndTag(name)
end
end
end

View file

@ -0,0 +1,78 @@
require 'html5/html5parser/phase'
module HTML5
class InCellPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
handle_start 'html', %w( caption col colgroup tbody td tfoot th thead tr ) => 'TableOther'
handle_end %w( td th ) => 'TableCell', %w( body caption col colgroup html ) => 'Ignore'
handle_end %w( table tbody tfoot thead tr ) => 'Imply'
def processCharacters(data)
@parser.phases[:inBody].processCharacters(data)
end
def startTagTableOther(name, attributes)
if in_scope?('td', true) or in_scope?('th', true)
closeCell
@parser.phase.processStartTag(name, attributes)
else
# innerHTML case
@parser.parseError
end
end
def startTagOther(name, attributes)
@parser.phases[:inBody].processStartTag(name, attributes)
end
def endTagTableCell(name)
if in_scope?(name, true)
@tree.generateImpliedEndTags(name)
if @tree.openElements[-1].name != name
@parser.parseError("Got table cell end tag (#{name}) while required end tags are missing.")
remove_open_elements_until(name)
else
@tree.openElements.pop
end
@tree.clearActiveFormattingElements
@parser.phase = @parser.phases[:inRow]
else
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
def endTagImply(name)
if in_scope?(name, true)
closeCell
@parser.phase.processEndTag(name)
else
# sometimes innerHTML case
@parser.parseError
end
end
def endTagOther(name)
@parser.phases[:inBody].processEndTag(name)
end
protected
def closeCell
if in_scope?('td', true)
endTagTableCell('td')
elsif in_scope?('th', true)
endTagTableCell('th')
end
end
end
end

View file

@ -0,0 +1,55 @@
require 'html5/html5parser/phase'
module HTML5
class InColumnGroupPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-column
handle_start 'html', 'col'
handle_end 'colgroup', 'col'
def ignoreEndTagColgroup
@tree.openElements[-1].name == 'html'
end
def processCharacters(data)
ignoreEndTag = ignoreEndTagColgroup
endTagColgroup("colgroup")
@parser.phase.processCharacters(data) unless ignoreEndTag
end
def startTagCol(name, attributes)
@tree.insertElement(name, attributes)
@tree.openElements.pop
end
def startTagOther(name, attributes)
ignoreEndTag = ignoreEndTagColgroup
endTagColgroup('colgroup')
@parser.phase.processStartTag(name, attributes) unless ignoreEndTag
end
def endTagColgroup(name)
if ignoreEndTagColgroup
# innerHTML case
assert @parser.innerHTML
@parser.parseError
else
@tree.openElements.pop
@parser.phase = @parser.phases[:inTable]
end
end
def endTagCol(name)
@parser.parseError(_('Unexpected end tag (col). col has no end tag.'))
end
def endTagOther(name)
ignoreEndTag = ignoreEndTagColgroup
endTagColgroup('colgroup')
@parser.phase.processEndTag(name) unless ignoreEndTag
end
end
end

View file

@ -0,0 +1,57 @@
require 'html5/html5parser/phase'
module HTML5
class InFramesetPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
handle_start 'html', 'frameset', 'frame', 'noframes'
handle_end 'frameset', 'noframes'
def processCharacters(data)
@parser.parseError(_('Unexpected characters in the frameset phase. Characters ignored.'))
end
def startTagFrameset(name, attributes)
@tree.insertElement(name, attributes)
end
def startTagFrame(name, attributes)
@tree.insertElement(name, attributes)
@tree.openElements.pop
end
def startTagNoframes(name, attributes)
@parser.phases[:inBody].processStartTag(name, attributes)
end
def startTagOther(name, attributes)
@parser.parseError(_("Unexpected start tag token (#{name}) in the frameset phase. Ignored"))
end
def endTagFrameset(name)
if @tree.openElements[-1].name == 'html'
# innerHTML case
@parser.parseError(_("Unexpected end tag token (frameset) in the frameset phase (innerHTML)."))
else
@tree.openElements.pop
end
if (not @parser.innerHTML and
@tree.openElements[-1].name != 'frameset')
# If we're not in innerHTML mode and the the current node is not a
# "frameset" element (anymore) then switch.
@parser.phase = @parser.phases[:afterFrameset]
end
end
def endTagNoframes(name)
@parser.phases[:inBody].processEndTag(name)
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag token (#{name}) in the frameset phase. Ignored."))
end
end
end

View file

@ -0,0 +1,126 @@
require 'html5/html5parser/phase'
module HTML5
class InHeadPhase < Phase
handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta )
handle_end 'head'
handle_end %w( html body br p ) => 'ImplyAfterHead'
handle_end %w( title style script )
def processEOF
if ['title', 'style', 'script'].include?(name = @tree.openElements[-1].name)
@parser.parseError(_("Unexpected end of file. Expected end tag (#{name})."))
@tree.openElements.pop
end
anythingElse
@parser.phase.processEOF
end
def processCharacters(data)
if ['title', 'style', 'script'].include?(@tree.openElements[-1].name)
@tree.insertText(data)
else
anythingElse
@parser.phase.processCharacters(data)
end
end
def startTagHead(name, attributes)
@parser.parseError(_('Unexpected start tag head in existing head. Ignored'))
end
def startTagTitle(name, attributes)
element = @tree.createElement(name, attributes)
appendToHead(element)
@tree.openElements.push(element)
@parser.tokenizer.contentModelFlag = :RCDATA
end
def startTagStyle(name, attributes)
element = @tree.createElement(name, attributes)
if @tree.headPointer != nil and @parser.phase == @parser.phases[:inHead]
appendToHead(element)
else
@tree.openElements[-1].appendChild(element)
end
@tree.openElements.push(element)
@parser.tokenizer.contentModelFlag = :CDATA
end
def startTagScript(name, attributes)
#XXX Inner HTML case may be wrong
element = @tree.createElement(name, attributes)
element._flags.push("parser-inserted")
if (@tree.headPointer != nil and
@parser.phase == @parser.phases[:inHead])
appendToHead(element)
else
@tree.openElements[-1].appendChild(element)
end
@tree.openElements.push(element)
@parser.tokenizer.contentModelFlag = :CDATA
end
def startTagBaseLinkMeta(name, attributes)
element = @tree.createElement(name, attributes)
if @tree.headPointer != nil and @parser.phase == @parser.phases[:inHead]
appendToHead(element)
else
@tree.openElements[-1].appendChild(element)
end
end
def startTagOther(name, attributes)
anythingElse
@parser.phase.processStartTag(name, attributes)
end
def endTagHead(name)
if @tree.openElements[-1].name == 'head'
@tree.openElements.pop
else
@parser.parseError(_("Unexpected end tag (head). Ignored."))
end
@parser.phase = @parser.phases[:afterHead]
end
def endTagImplyAfterHead(name)
anythingElse
@parser.phase.processEndTag(name)
end
def endTagTitleStyleScript(name)
if @tree.openElements[-1].name == name
@tree.openElements.pop
else
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
def anythingElse
if @tree.openElements[-1].name == 'head'
endTagHead('head')
else
@parser.phase = @parser.phases[:afterHead]
end
end
protected
def appendToHead(element)
if @tree.headPointer.nil?
assert @parser.innerHTML
@tree.openElements[-1].appendChild(element)
else
@tree.headPointer.appendChild(element)
end
end
end
end

View file

@ -0,0 +1,87 @@
require 'html5/html5parser/phase'
module HTML5
class InRowPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
handle_start 'html', %w( td th ) => 'TableCell', %w( caption col colgroup tbody tfoot thead tr ) => 'TableOther'
handle_end 'tr', 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th ) => 'Ignore'
def processCharacters(data)
@parser.phases[:inTable].processCharacters(data)
end
def startTagTableCell(name, attributes)
clearStackToTableRowContext
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inCell]
@tree.activeFormattingElements.push(Marker)
end
def startTagTableOther(name, attributes)
ignoreEndTag = ignoreEndTagTr
endTagTr('tr')
# XXX how are we sure it's always ignored in the innerHTML case?
@parser.phase.processStartTag(name, attributes) unless ignoreEndTag
end
def startTagOther(name, attributes)
@parser.phases[:inTable].processStartTag(name, attributes)
end
def endTagTr(name)
if ignoreEndTagTr
# innerHTML case
assert @parser.innerHTML
@parser.parseError
else
clearStackToTableRowContext
@tree.openElements.pop
@parser.phase = @parser.phases[:inTableBody]
end
end
def endTagTable(name)
ignoreEndTag = ignoreEndTagTr
endTagTr('tr')
# Reprocess the current tag if the tr end tag was not ignored
# XXX how are we sure it's always ignored in the innerHTML case?
@parser.phase.processEndTag(name) unless ignoreEndTag
end
def endTagTableRowGroup(name)
if in_scope?(name, true)
endTagTr('tr')
@parser.phase.processEndTag(name)
else
# innerHTML case
@parser.parseError
end
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}) in the row phase. Ignored."))
end
def endTagOther(name)
@parser.phases[:inTable].processEndTag(name)
end
protected
# XXX unify this with other table helper methods
def clearStackToTableRowContext
until ['tr', 'html'].include?(name = @tree.openElements[-1].name)
@parser.parseError(_("Unexpected implied end tag (#{name}) in the row phase."))
@tree.openElements.pop
end
end
def ignoreEndTagTr
not in_scope?('tr', :tableVariant => true)
end
end
end

View file

@ -0,0 +1,84 @@
require 'html5/html5parser/phase'
module HTML5
class InSelectPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-select
handle_start 'html', 'option', 'optgroup', 'select'
handle_end 'option', 'optgroup', 'select', %w( caption table tbody tfoot thead tr td th ) => 'TableElements'
def processCharacters(data)
@tree.insertText(data)
end
def startTagOption(name, attributes)
# We need to imply </option> if <option> is the current node.
@tree.openElements.pop if @tree.openElements[-1].name == 'option'
@tree.insertElement(name, attributes)
end
def startTagOptgroup(name, attributes)
@tree.openElements.pop if @tree.openElements[-1].name == 'option'
@tree.openElements.pop if @tree.openElements[-1].name == 'optgroup'
@tree.insertElement(name, attributes)
end
def startTagSelect(name, attributes)
@parser.parseError(_('Unexpected start tag (select) in the select phase implies select start tag.'))
endTagSelect('select')
end
def startTagOther(name, attributes)
@parser.parseError(_('Unexpected start tag token (#{name}) in the select phase. Ignored.'))
end
def endTagOption(name)
if @tree.openElements[-1].name == 'option'
@tree.openElements.pop
else
@parser.parseError(_('Unexpected end tag (option) in the select phase. Ignored.'))
end
end
def endTagOptgroup(name)
# </optgroup> implicitly closes <option>
if @tree.openElements[-1].name == 'option' and @tree.openElements[-2].name == 'optgroup'
@tree.openElements.pop
end
# It also closes </optgroup>
if @tree.openElements[-1].name == 'optgroup'
@tree.openElements.pop
# But nothing else
else
@parser.parseError(_('Unexpected end tag (optgroup) in the select phase. Ignored.'))
end
end
def endTagSelect(name)
if in_scope?('select', true)
remove_open_elements_until('select')
@parser.resetInsertionMode
else
# innerHTML case
@parser.parseError
end
end
def endTagTableElements(name)
@parser.parseError(_("Unexpected table end tag (#{name}) in the select phase."))
if in_scope?(name, true)
endTagSelect('select')
@parser.phase.processEndTag(name)
end
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag token (#{name}) in the select phase. Ignored."))
end
end
end

View file

@ -0,0 +1,83 @@
require 'html5/html5parser/phase'
module HTML5
class InTableBodyPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
handle_start 'html', 'tr', %w( td th ) => 'TableCell', %w( caption col colgroup tbody tfoot thead ) => 'TableOther'
handle_end 'table', %w( tbody tfoot thead ) => 'TableRowGroup', %w( body caption col colgroup html td th tr ) => 'Ingore'
def processCharacters(data)
@parser.phases[:inTable].processCharacters(data)
end
def startTagTr(name, attributes)
clearStackToTableBodyContext
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inRow]
end
def startTagTableCell(name, attributes)
@parser.parseError(_("Unexpected table cell start tag (#{name}) in the table body phase."))
startTagTr('tr', {})
@parser.phase.processStartTag(name, attributes)
end
def startTagTableOther(name, attributes)
# XXX AT Any ideas on how to share this with endTagTable?
if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true)
clearStackToTableBodyContext
endTagTableRowGroup(@tree.openElements[-1].name)
@parser.phase.processStartTag(name, attributes)
else
# innerHTML case
@parser.parseError
end
end
def startTagOther(name, attributes)
@parser.phases[:inTable].processStartTag(name, attributes)
end
def endTagTableRowGroup(name)
if in_scope?(name, true)
clearStackToTableBodyContext
@tree.openElements.pop
@parser.phase = @parser.phases[:inTable]
else
@parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
end
end
def endTagTable(name)
if in_scope?('tbody', true) or in_scope?('thead', true) or in_scope?('tfoot', true)
clearStackToTableBodyContext
endTagTableRowGroup(@tree.openElements[-1].name)
@parser.phase.processEndTag(name)
else
# innerHTML case
@parser.parseError
end
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}) in the table body phase. Ignored."))
end
def endTagOther(name)
@parser.phases[:inTable].processEndTag(name)
end
protected
def clearStackToTableBodyContext
until ['tbody', 'tfoot', 'thead', 'html'].include?(name = @tree.openElements[-1].name)
@parser.parseError(_("Unexpected implied end tag (#{name}) in the table body phase."))
@tree.openElements.pop
end
end
end
end

View file

@ -0,0 +1,110 @@
require 'html5/html5parser/phase'
module HTML5
class InTablePhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
handle_start 'html', 'caption', 'colgroup', 'col', 'table'
handle_start %w( tbody tfoot thead ) => 'RowGroup', %w( td th tr ) => 'ImplyTbody'
handle_end 'table', %w( body caption col colgroup html tbody td tfoot th thead tr ) => 'Ignore'
def processCharacters(data)
@parser.parseError(_("Unexpected non-space characters in table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in
@tree.insertFromTable = true
# Process the character in the "in body" mode
@parser.phases[:inBody].processCharacters(data)
@tree.insertFromTable = false
end
def startTagCaption(name, attributes)
clearStackToTableContext
@tree.activeFormattingElements.push(Marker)
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inCaption]
end
def startTagColgroup(name, attributes)
clearStackToTableContext
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inColumnGroup]
end
def startTagCol(name, attributes)
startTagColgroup('colgroup', {})
@parser.phase.processStartTag(name, attributes)
end
def startTagRowGroup(name, attributes)
clearStackToTableContext
@tree.insertElement(name, attributes)
@parser.phase = @parser.phases[:inTableBody]
end
def startTagImplyTbody(name, attributes)
startTagRowGroup('tbody', {})
@parser.phase.processStartTag(name, attributes)
end
def startTagTable(name, attributes)
@parser.parseError(_("Unexpected start tag (table) in table phase. Implies end tag (table)."))
@parser.phase.processEndTag('table')
@parser.phase.processStartTag(name, attributes) unless @parser.innerHTML
end
def startTagOther(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}) in table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in
@tree.insertFromTable = true
# Process the start tag in the "in body" mode
@parser.phases[:inBody].processStartTag(name, attributes)
@tree.insertFromTable = false
end
def endTagTable(name)
if in_scope?('table', true)
@tree.generateImpliedEndTags
unless @tree.openElements[-1].name == 'table'
@parser.parseError(_("Unexpected end tag (table). Expected end tag (#{@tree.openElements[-1].name})."))
end
remove_open_elements_until('table')
@parser.resetInsertionMode
else
# innerHTML case
assert @parser.innerHTML
@parser.parseError
end
end
def endTagIgnore(name)
@parser.parseError(_("Unexpected end tag (#{name}). Ignored."))
end
def endTagOther(name)
@parser.parseError(_("Unexpected end tag (#{name}) in table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in
@tree.insertFromTable = true
# Process the end tag in the "in body" mode
@parser.phases[:inBody].processEndTag(name)
@tree.insertFromTable = false
end
protected
def clearStackToTableContext
# "clear the stack back to a table context"
until ['table', 'html'].include?(name = @tree.openElements[-1].name)
@parser.parseError(_("Unexpected implied end tag (#{name}) in the table phase."))
@tree.openElements.pop
end
# When the current node is <html> it's an innerHTML case
end
end
end

View file

@ -0,0 +1,135 @@
require 'html5/html5parser/phase'
module HTML5
class InitialPhase < Phase
# This phase deals with error handling as well which is currently not
# covered in the specification. The error handling is typically known as
# "quirks mode". It is expected that a future version of HTML5 will define this.
def processEOF
@parser.parseError(_('Unexpected End of file. Expected DOCTYPE.'))
@parser.phase = @parser.phases[:rootElement]
@parser.phase.processEOF
end
def processComment(data)
@tree.insertComment(data, @tree.document)
end
def processDoctype(name, publicId, systemId, correct)
if name.downcase != 'html' or publicId or systemId
@parser.parseError(_('Erroneous DOCTYPE.'))
end
# XXX need to update DOCTYPE tokens
@tree.insertDoctype(name)
publicId = publicId.to_s.upcase
if name.downcase != 'html'
# XXX quirks mode
else
if ["+//silmaril//dtd html pro v0r11 19970101//en",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
"-//as//dtd html 3.0 aswedit + extensions//en",
"-//ietf//dtd html 2.0 level 1//en",
"-//ietf//dtd html 2.0 level 2//en",
"-//ietf//dtd html 2.0 strict level 1//en",
"-//ietf//dtd html 2.0 strict level 2//en",
"-//ietf//dtd html 2.0 strict//en",
"-//ietf//dtd html 2.0//en",
"-//ietf//dtd html 2.1e//en",
"-//ietf//dtd html 3.0//en",
"-//ietf//dtd html 3.0//en//",
"-//ietf//dtd html 3.2 final//en",
"-//ietf//dtd html 3.2//en",
"-//ietf//dtd html 3//en",
"-//ietf//dtd html level 0//en",
"-//ietf//dtd html level 0//en//2.0",
"-//ietf//dtd html level 1//en",
"-//ietf//dtd html level 1//en//2.0",
"-//ietf//dtd html level 2//en",
"-//ietf//dtd html level 2//en//2.0",
"-//ietf//dtd html level 3//en",
"-//ietf//dtd html level 3//en//3.0",
"-//ietf//dtd html strict level 0//en",
"-//ietf//dtd html strict level 0//en//2.0",
"-//ietf//dtd html strict level 1//en",
"-//ietf//dtd html strict level 1//en//2.0",
"-//ietf//dtd html strict level 2//en",
"-//ietf//dtd html strict level 2//en//2.0",
"-//ietf//dtd html strict level 3//en",
"-//ietf//dtd html strict level 3//en//3.0",
"-//ietf//dtd html strict//en",
"-//ietf//dtd html strict//en//2.0",
"-//ietf//dtd html strict//en//3.0",
"-//ietf//dtd html//en",
"-//ietf//dtd html//en//2.0",
"-//ietf//dtd html//en//3.0",
"-//metrius//dtd metrius presentational//en",
"-//microsoft//dtd internet explorer 2.0 html strict//en",
"-//microsoft//dtd internet explorer 2.0 html//en",
"-//microsoft//dtd internet explorer 2.0 tables//en",
"-//microsoft//dtd internet explorer 3.0 html strict//en",
"-//microsoft//dtd internet explorer 3.0 html//en",
"-//microsoft//dtd internet explorer 3.0 tables//en",
"-//netscape comm. corp.//dtd html//en",
"-//netscape comm. corp.//dtd strict html//en",
"-//o'reilly and associates//dtd html 2.0//en",
"-//o'reilly and associates//dtd html extended 1.0//en",
"-//spyglass//dtd html 2.0 extended//en",
"-//sq//dtd html 2.0 hotmetal + extensions//en",
"-//sun microsystems corp.//dtd hotjava html//en",
"-//sun microsystems corp.//dtd hotjava strict html//en",
"-//w3c//dtd html 3 1995-03-24//en",
"-//w3c//dtd html 3.2 draft//en",
"-//w3c//dtd html 3.2 final//en",
"-//w3c//dtd html 3.2//en",
"-//w3c//dtd html 3.2s draft//en",
"-//w3c//dtd html 4.0 frameset//en",
"-//w3c//dtd html 4.0 transitional//en",
"-//w3c//dtd html experimental 19960712//en",
"-//w3c//dtd html experimental 970421//en",
"-//w3c//dtd w3 html//en",
"-//w3o//dtd w3 html 3.0//en",
"-//w3o//dtd w3 html 3.0//en//",
"-//w3o//dtd w3 html strict 3.0//en//",
"-//webtechs//dtd mozilla html 2.0//en",
"-//webtechs//dtd mozilla html//en",
"-/w3c/dtd html 4.0 transitional/en",
"html"].include?(publicId) or
(systemId == nil and
["-//w3c//dtd html 4.01 frameset//EN",
"-//w3c//dtd html 4.01 transitional//EN"].include?(publicId)) or
(systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")
#XXX quirks mode
end
end
@parser.phase = @parser.phases[:rootElement]
end
def processSpaceCharacters(data)
@tree.insertText(data, @tree.document)
end
def processCharacters(data)
@parser.parseError(_('Unexpected non-space characters. Expected DOCTYPE.'))
@parser.phase = @parser.phases[:rootElement]
@parser.phase.processCharacters(data)
end
def processStartTag(name, attributes)
@parser.parseError(_("Unexpected start tag (#{name}). Expected DOCTYPE."))
@parser.phase = @parser.phases[:rootElement]
@parser.phase.processStartTag(name, attributes)
end
def processEndTag(name)
@parser.parseError(_("Unexpected end tag (#{name}). Expected DOCTYPE."))
@parser.phase = @parser.phases[:rootElement]
@parser.phase.processEndTag(name)
end
end
end

View file

@ -0,0 +1,156 @@
module HTML5
# Base class for helper objects that implement each phase of processing.
#
# Handler methods should be in the following order (they can be omitted):
#
# * EOF
# * Comment
# * Doctype
# * SpaceCharacters
# * Characters
# * StartTag
# - startTag* methods
# * EndTag
# - endTag* methods
#
class Phase
# The following example call:
#
# tag_handlers('startTag', 'html', %( base link meta ), %( li dt dd ) => 'ListItem')
#
# ...would return a hash equal to this:
#
# { 'html' => 'startTagHtml',
# 'base' => 'startTagBaseLinkMeta',
# 'link' => 'startTagBaseLinkMeta',
# 'meta' => 'startTagBaseLinkMeta',
# 'li' => 'startTagListItem',
# 'dt' => 'startTagListItem',
# 'dd' => 'startTagListItem' }
#
def self.tag_handlers(prefix, *tags)
mapping = {}
if tags.last.is_a?(Hash)
tags.pop.each do |names, handler_method_suffix|
handler_method = prefix + handler_method_suffix
Array(names).each { |name| mapping[name] = handler_method }
end
end
tags.each do |names|
names = Array(names)
handler_method = prefix + names.map { |name| name.capitalize }.join
names.each { |name| mapping[name] = handler_method }
end
return mapping
end
def self.start_tag_handlers
@start_tag_handlers ||= Hash.new('startTagOther')
end
# Declare what start tags this Phase handles. Can be called more than once.
#
# Example usage:
#
# handle_start 'html'
# # html start tags will be handled by a method named 'startTagHtml'
#
# handle_start %( base link meta )
# # base, link and meta start tags will be handled by a method named 'startTagBaseLinkMeta'
#
# handle_start %( li dt dd ) => 'ListItem'
# # li, dt, and dd start tags will be handled by a method named 'startTagListItem'
#
def self.handle_start(*tags)
start_tag_handlers.update tag_handlers('startTag', *tags)
end
def self.end_tag_handlers
@end_tag_handlers ||= Hash.new('endTagOther')
end
# Declare what end tags this Phase handles. Behaves like handle_start.
#
def self.handle_end(*tags)
end_tag_handlers.update tag_handlers('endTag', *tags)
end
def initialize(parser, tree)
@parser, @tree = parser, tree
end
def processEOF
@tree.generateImpliedEndTags
if @tree.openElements.length > 2
@parser.parseError(_('Unexpected end of file. Missing closing tags.'))
elsif @tree.openElements.length == 2 and @tree.openElements[1].name != 'body'
# This happens for framesets or something?
@parser.parseError(_("Unexpected end of file. Expected end tag (#{@tree.openElements[1].name}) first."))
elsif @parser.innerHTML and @tree.openElements.length > 1
# XXX This is not what the specification says. Not sure what to do here.
@parser.parseError(_('XXX innerHTML EOF'))
end
# Betting ends.
end
def processComment(data)
# For most phases the following is correct. Where it's not it will be
# overridden.
@tree.insertComment(data, @tree.openElements[-1])
end
def processDoctype(name, publicId, systemId, correct)
@parser.parseError(_('Unexpected DOCTYPE. Ignored.'))
end
def processSpaceCharacters(data)
@tree.insertText(data)
end
def processStartTag(name, attributes)
send self.class.start_tag_handlers[name], name, attributes
end
def startTagHtml(name, attributes)
if @parser.firstStartTag == false and name == 'html'
@parser.parseError(_('html needs to be the first start tag.'))
end
# XXX Need a check here to see if the first start tag token emitted is
# this token... If it's not, invoke @parser.parseError.
attributes.each do |attr, value|
unless @tree.openElements[0].attributes.has_key?(attr)
@tree.openElements[0].attributes[attr] = value
end
end
@parser.firstStartTag = false
end
def processEndTag(name)
send self.class.end_tag_handlers[name], name
end
def _(string)
string
end
def assert(value)
throw AssertionError.new unless value
end
def in_scope?(*args)
@tree.elementInScope(*args)
end
def remove_open_elements_until(name=nil)
finished = false
until finished
element = @tree.openElements.pop
finished = name.nil?? yield(element) : element.name == name
end
return element
end
end
end

View file

@ -0,0 +1,43 @@
require 'html5/html5parser/phase'
module HTML5
class RootElementPhase < Phase
def processEOF
insertHtmlElement
@parser.phase.processEOF
end
def processComment(data)
@tree.insertComment(data, @tree.document)
end
def processSpaceCharacters(data)
@tree.insertText(data, @tree.document)
end
def processCharacters(data)
insertHtmlElement
@parser.phase.processCharacters(data)
end
def processStartTag(name, attributes)
@parser.firstStartTag = true if name == 'html'
insertHtmlElement
@parser.phase.processStartTag(name, attributes)
end
def processEndTag(name)
insertHtmlElement
@parser.phase.processEndTag(name)
end
def insertHtmlElement
element = @tree.createElement('html', {})
@tree.openElements.push(element)
@tree.document.appendChild(element)
@parser.phase = @parser.phases[:beforeHead]
end
end
end

View file

@ -0,0 +1,36 @@
require 'html5/html5parser/phase'
module HTML5
class TrailingEndPhase < Phase
def processEOF
end
def processComment(data)
@tree.insertComment(data, @tree.document)
end
def processSpaceCharacters(data)
@parser.lastPhase.processSpaceCharacters(data)
end
def processCharacters(data)
@parser.parseError(_('Unexpected non-space characters. Expected end of file.'))
@parser.phase = @parser.lastPhase
@parser.phase.processCharacters(data)
end
def processStartTag(name, attributes)
@parser.parseError(_('Unexpected start tag (#{name}). Expected end of file.'))
@parser.phase = @parser.lastPhase
@parser.phase.processStartTag(name, attributes)
end
def processEndTag(name)
@parser.parseError(_('Unexpected end tag (#{name}). Expected end of file.'))
@parser.phase = @parser.lastPhase
@parser.phase.processEndTag(name)
end
end
end

View file

@ -0,0 +1,654 @@
require 'stringio'
require 'html5/constants'
module HTML5
# Provides a unicode stream of characters to the HTMLTokenizer.
# This class takes care of character encoding and removing or replacing
# incorrect byte-sequences and also provides column and line tracking.
class HTMLInputStream
attr_accessor :queue, :char_encoding, :errors
# Initialises the HTMLInputStream.
#
# HTMLInputStream(source, [encoding]) -> Normalized stream from source
# for use by the HTML5Lib.
#
# source can be either a file-object, local filename or a string.
#
# The optional encoding parameter must be a string that indicates
# the encoding. If specified, that encoding will be used,
# regardless of any BOM or later declaration (such as in a meta
# element)
#
# parseMeta - Look for a <meta> element containing encoding information
def initialize(source, options = {})
@encoding = nil
@parse_meta = true
@chardet = true
options.each { |name, value| instance_variable_set("@#{name}", value) }
# Raw Stream
@raw_stream = open_stream(source)
# Encoding Information
#Number of bytes to use when looking for a meta element with
#encoding information
@NUM_BYTES_META = 512
#Number of bytes to use when using detecting encoding using chardet
@NUM_BYTES_CHARDET = 256
#Number of bytes to use when reading content
@NUM_BYTES_BUFFER = 1024
#Encoding to use if no other information can be found
@DEFAULT_ENCODING = 'windows-1252'
#Detect encoding iff no explicit "transport level" encoding is supplied
if @encoding.nil? or not HTML5.is_valid_encoding(@encoding)
@char_encoding = detect_encoding
else
@char_encoding = @encoding
end
# Read bytes from stream decoding them into Unicode
@buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
if @char_encoding == 'windows-1252'
@win1252 = true
elsif @char_encoding != 'utf-8'
begin
require 'iconv'
begin
@buffer << @raw_stream.read unless @raw_stream.eof?
@buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
rescue
@win1252 = true
end
rescue LoadError
@win1252 = true
end
end
@queue = []
@errors = []
# Reset position in the list to read from
@tell = 0
@line = @col = 0
@line_lengths = []
end
# Produces a file object from source.
#
# source can be either a file object, local filename or a string.
def open_stream(source)
# Already an IO like object
if source.respond_to?(:read)
@stream = source
else
# Treat source as a string and wrap in StringIO
@stream = StringIO.new(source)
end
return @stream
end
def detect_encoding
#First look for a BOM
#This will also read past the BOM if present
encoding = detect_bom
#If there is no BOM need to look for meta elements with encoding
#information
if encoding.nil? and @parse_meta
encoding = detect_encoding_meta
end
#Guess with chardet, if avaliable
if encoding.nil? and @chardet
begin
require 'rubygems'
require 'UniversalDetector' # gem install chardet
buffers = []
detector = UniversalDetector::Detector.instance
detector.reset
until @raw_stream.eof?
buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
break if !buffer or buffer.empty?
buffers << buffer
detector.feed(buffer)
break if detector.instance_eval {@done}
detector.instance_eval {
@_mLastChar = @_mLastChar.chr if Fixnum === @_mLastChar
}
end
detector.close
encoding = detector.result['encoding']
seek(buffers*'', 0)
rescue LoadError
end
end
# If all else fails use the default encoding
if encoding.nil?
encoding = @DEFAULT_ENCODING
end
#Substitute for equivalent encodings
encoding_sub = {'iso-8859-1' => 'windows-1252'}
if encoding_sub.has_key?(encoding.downcase)
encoding = encoding_sub[encoding.downcase]
end
return encoding
end
# Attempts to detect at BOM at the start of the stream. If
# an encoding can be determined from the BOM return the name of the
# encoding otherwise return nil
def detect_bom
bom_dict = {
"\xef\xbb\xbf" => 'utf-8',
"\xff\xfe" => 'utf-16le',
"\xfe\xff" => 'utf-16be',
"\xff\xfe\x00\x00" => 'utf-32le',
"\x00\x00\xfe\xff" => 'utf-32be'
}
# Go to beginning of file and read in 4 bytes
string = @raw_stream.read(4)
return nil unless string
# Try detecting the BOM using bytes from the string
encoding = bom_dict[string[0...3]] # UTF-8
seek = 3
unless encoding
# Need to detect UTF-32 before UTF-16
encoding = bom_dict[string] # UTF-32
seek = 4
unless encoding
encoding = bom_dict[string[0...2]] # UTF-16
seek = 2
end
end
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
seek(string, encoding ? seek : 0)
return encoding
end
def seek(buffer, n)
if @raw_stream.respond_to?(:unget)
@raw_stream.unget(buffer[n..-1])
return
end
if @raw_stream.respond_to?(:seek)
begin
@raw_stream.seek(n)
return
rescue Errno::ESPIPE
end
end
require 'delegate'
@raw_stream = SimpleDelegator.new(@raw_stream)
class << @raw_stream
def read(chars=-1)
if chars == -1 or chars > @data.length
result = @data
@data = ''
return result if __getobj__.eof?
return result + __getobj__.read if chars == -1
return result + __getobj__.read(chars-result.length)
elsif @data.empty?
return __getobj__.read(chars)
else
result = @data[1...chars]
@data = @data[chars..-1]
return result
end
end
def unget(data)
if !@data or @data.empty?
@data = data
else
@data += data
end
end
end
@raw_stream.unget(buffer[n .. -1])
end
# Report the encoding declared by the meta element
def detect_encoding_meta
buffer = @raw_stream.read(@NUM_BYTES_META)
parser = EncodingParser.new(buffer)
seek(buffer, 0)
return parser.get_encoding
end
# Returns (line, col) of the current position in the stream.
def position
line, col = @line, @col
@queue.reverse.each do |c|
if c == "\n"
line -= 1
raise RuntimeError.new("col=#{col}") unless col == 0
col = @line_lengths[line]
else
col -= 1
end
end
return [line+1, col]
end
# Read one character from the stream or queue if available. Return
# EOF when EOF is reached.
def char
unless @queue.empty?
return @queue.shift
else
if @tell + 3 > @buffer.length and !@raw_stream.eof?
# read next block
@buffer = @buffer[@tell .. -1] + @raw_stream.read(@NUM_BYTES_BUFFER)
@tell = 0
end
c = @buffer[@tell]
@tell += 1
case c
when 0x01 .. 0x7F
if c == 0x0D
# normalize newlines
@tell += 1 if @buffer[@tell] == 0x0A
c = 0x0A
end
# update position in stream
if c == 0x0a
@line_lengths << @col
@line += 1
@col = 0
else
@col += 1
end
c.chr
when 0x80 .. 0xBF
if !@win1252
[0xFFFD].pack('U') # invalid utf-8
elsif c <= 0x9f
[ENTITIES_WINDOWS1252[c-0x80]].pack('U')
else
"\xC2" + c.chr # convert to utf-8
end
when 0xC0 .. 0xFF
if @win1252
"\xC3" + (c-64).chr # convert to utf-8
elsif @buffer[@tell-1 .. @tell+3] =~ /^
( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)/x
@tell += $1.length - 1
$1
else
[0xFFFD].pack('U') # invalid utf-8
end
when 0x00
@errors.push('null character found in input stream, ' +
'replaced with U+FFFD')
[0xFFFD].pack('U') # null characters are invalid
else
:EOF
end
end
end
# Returns a string of characters from the stream up to but not
# including any character in characters or EOF. characters can be
# any container that supports the in method being called on it.
def chars_until(characters, opposite=false)
char_stack = [char]
while char_stack.last != :EOF
break unless (characters.include?(char_stack.last)) == opposite
char_stack.push(char)
end
# Put the character stopped on back to the front of the queue
# from where it came.
c = char_stack.pop
@queue.insert(0, c) unless c == :EOF
return char_stack.join('')
end
def unget(characters)
@queue.unshift(*characters.to_a) unless characters == :EOF
end
end
# String-like object with an assosiated position and various extra methods
# If the position is ever greater than the string length then an exception is raised
class EncodingBytes < String
attr_accessor :position
def initialize(value)
super(value)
@position = -1
end
def each
while @position < length
@position += 1
yield self[@position]
end
rescue EOF
end
def current_byte
raise EOF if @position >= length
return self[@position].chr
end
# Skip past a list of characters
def skip(chars=SPACE_CHARACTERS)
while chars.include?(current_byte)
@position += 1
end
end
# Look for a sequence of bytes at the start of a string. If the bytes
# are found return true and advance the position to the byte after the
# match. Otherwise return false and leave the position alone
def match_bytes(bytes, lower=false)
data = self[position ... position+bytes.length]
data.downcase! if lower
rv = (data == bytes)
@position += bytes.length if rv == true
return rv
end
# Look for the next sequence of bytes matching a given sequence. If
# a match is found advance the position to the last byte of the match
def jump_to(bytes)
new_position = self[position .. -1].index(bytes)
if new_position
@position += (new_position + bytes.length-1)
return true
else
raise EOF
end
end
# Move the pointer so it points to the next byte in a set of possible
# bytes
def find_next(byte_list)
until byte_list.include?(current_byte)
@position += 1
end
end
end
# Mini parser for detecting character encoding from meta elements
class EncodingParser
# string - the data to work on for encoding detection
def initialize(data)
@data = EncodingBytes.new(data.to_s)
@encoding = nil
end
@@method_dispatch = [
['<!--', :handle_comment],
['<meta', :handle_meta],
['</', :handle_possible_end_tag],
['<!', :handle_other],
['<?', :handle_other],
['<', :handle_possible_start_tag]
]
def get_encoding
@data.each do |byte|
keep_parsing = true
@@method_dispatch.each do |(key, method)|
if @data.match_bytes(key, lower = true)
keep_parsing = send(method)
break
end
end
break unless keep_parsing
end
@encoding = @encoding.strip unless @encoding.nil?
return @encoding
end
# Skip over comments
def handle_comment
return @data.jump_to('-->')
end
def handle_meta
# if we have <meta not followed by a space so just keep going
return true unless SPACE_CHARACTERS.include?(@data.current_byte)
#We have a valid meta element we want to search for attributes
while true
#Try to find the next attribute after the current position
attr = get_attribute
return true if attr.nil?
if attr[0] == 'charset'
tentative_encoding = attr[1]
if HTML5.is_valid_encoding(tentative_encoding)
@encoding = tentative_encoding
return false
end
elsif attr[0] == 'content'
content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
tentative_encoding = content_parser.parse
if HTML5.is_valid_encoding(tentative_encoding)
@encoding = tentative_encoding
return false
end
end
end
end
def handle_possible_start_tag
return handle_possible_tag(false)
end
def handle_possible_end_tag
@data.position += 1
return handle_possible_tag(true)
end
def handle_possible_tag(end_tag)
unless ASCII_LETTERS.include?(@data.current_byte)
#If the next byte is not an ascii letter either ignore this
#fragment (possible start tag case) or treat it according to
#handleOther
if end_tag
@data.position -= 1
handle_other
end
return true
end
@data.find_next(SPACE_CHARACTERS + ['<', '>'])
if @data.current_byte == '<'
#return to the first step in the overall "two step" algorithm
#reprocessing the < byte
@data.position -= 1
else
#Read all attributes
{} until get_attribute.nil?
end
return true
end
def handle_other
return @data.jump_to('>')
end
# Return a name,value pair for the next attribute in the stream,
# if one is found, or nil
def get_attribute
@data.skip(SPACE_CHARACTERS + ['/'])
if @data.current_byte == '<'
@data.position -= 1
return nil
elsif @data.current_byte == '>'
return nil
end
attr_name = []
attr_value = []
space_found = false
#Step 5 attribute name
while true
if @data.current_byte == '=' and attr_name
break
elsif SPACE_CHARACTERS.include?(@data.current_byte)
space_found = true
break
elsif ['/', '<', '>'].include?(@data.current_byte)
return [attr_name.join(''), '']
elsif ASCII_UPPERCASE.include?(@data.current_byte)
attr_name.push(@data.current_byte.downcase)
else
attr_name.push(@data.current_byte)
end
#Step 6
@data.position += 1
end
#Step 7
if space_found
@data.skip
#Step 8
unless @data.current_byte == '='
@data.position -= 1
return [attr_name.join(''), '']
end
end
#XXX need to advance position in both spaces and value case
#Step 9
@data.position += 1
#Step 10
@data.skip
#Step 11
if ["'", '"'].include?(@data.current_byte)
#11.1
quote_char = @data.current_byte
while true
@data.position+=1
#11.3
if @data.current_byte == quote_char
@data.position += 1
return [attr_name.join(''), attr_value.join('')]
#11.4
elsif ASCII_UPPERCASE.include?(@data.current_byte)
attr_value.push(@data.current_byte.downcase)
#11.5
else
attr_value.push(@data.current_byte)
end
end
elsif ['>', '<'].include?(@data.current_byte)
return [attr_name.join(''), '']
elsif ASCII_UPPERCASE.include?(@data.current_byte)
attr_value.push(@data.current_byte.downcase)
else
attr_value.push(@data.current_byte)
end
while true
@data.position += 1
if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
return [attr_name.join(''), attr_value.join('')]
elsif ASCII_UPPERCASE.include?(@data.current_byte)
attr_value.push(@data.current_byte.downcase)
else
attr_value.push(@data.current_byte)
end
end
end
end
class ContentAttrParser
def initialize(data)
@data = data
end
def parse
begin
#Skip to the first ";"
@data.position = 0
@data.jump_to(';')
@data.position += 1
@data.skip
#Check if the attr name is charset
#otherwise return
@data.jump_to('charset')
@data.position += 1
@data.skip
unless @data.current_byte == '='
#If there is no = sign keep looking for attrs
return nil
end
@data.position += 1
@data.skip
#Look for an encoding between matching quote marks
if ['"', "'"].include?(@data.current_byte)
quote_mark = @data.current_byte
@data.position += 1
old_position = @data.position
@data.jump_to(quote_mark)
return @data[old_position ... @data.position]
else
#Unquoted value
old_position = @data.position
begin
@data.find_next(SPACE_CHARACTERS)
return @data[old_position ... @data.position]
rescue EOF
#Return the whole remaining value
return @data[old_position .. -1]
end
end
rescue EOF
return nil
end
end
end
# Determine if a string is a supported encoding
def self.is_valid_encoding(encoding)
(not encoding.nil? and encoding.kind_of?(String) and ENCODINGS.include?(encoding.downcase.strip))
end
end

View file

@ -0,0 +1,158 @@
# Warning: this module is experimental and subject to change and even removal
# at any time.
#
# For background/rationale, see:
# * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
# * http://tinyurl.com/ylfj8k (and follow-ups)
#
# References:
# * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
# * http://wiki.whatwg.org/wiki/HtmlVsXhtml
#
# @@TODO:
# * Selectively lowercase only XHTML, but not foreign markup
require 'html5/html5parser'
require 'html5/constants'
module HTML5
# liberal XML parser
class XMLParser < HTMLParser
def initialize(options = {})
super options
@phases[:initial] = XmlRootPhase.new(self, @tree)
end
def normalizeToken(token)
case token[:type]
when :StartTag, :EmptyTag
# We need to remove the duplicate attributes and convert attributes
# to a Hash so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
token[:data] = Hash[*token[:data].reverse.flatten]
# For EmptyTags, process both a Start and an End tag
if token[:type] == :EmptyTag
save = @tokenizer.contentModelFlag
@phase.processStartTag(token[:name], token[:data])
@tokenizer.contentModelFlag = save
token[:data] = {}
token[:type] = :EndTag
end
when :Characters
# un-escape RCDATA_ELEMENTS (e.g. style, script)
if @tokenizer.contentModelFlag == :CDATA
token[:data] = token[:data].
gsub('&lt;','<').gsub('&gt;','>').gsub('&amp;','&')
end
when :EndTag
if token[:data]
parseError(_("End tag contains unexpected attributes."))
end
when :Comment
# Rescue CDATA from the comments
if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
token[:type] = :Characters
token[:data] = token[:data][7 ... -2]
end
end
return token
end
end
# liberal XMTHML parser
class XHTMLParser < XMLParser
def initialize(options = {})
super options
@phases[:initial] = InitialPhase.new(self, @tree)
@phases[:rootElement] = XhmlRootPhase.new(self, @tree)
end
def normalizeToken(token)
super(token)
# ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted
if token[:type] == :EndTag
if VOID_ELEMENTS.include? token[:name]
if @tree.openElements[-1].name != token["name"]:
token[:type] = :EmptyTag
token["data"] ||= {}
end
else
if token[:name] == @tree.openElements[-1].name and \
not @tree.openElements[-1].hasContent
@tree.insertText('') unless
@tree.openElements.any? {|e|
e.attributes.keys.include? 'xmlns' and
e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
}
end
end
end
return token
end
end
class XhmlRootPhase < RootElementPhase
def insertHtmlElement
element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
@tree.openElements.push(element)
@tree.document.appendChild(element)
@parser.phase = @parser.phases[:beforeHead]
end
end
class XmlRootPhase < Phase
# Prime the Xml parser
@start_tag_handlers = Hash.new(:startTagOther)
@end_tag_handlers = Hash.new(:endTagOther)
def startTagOther(name, attributes)
@tree.openElements.push(@tree.document)
element = @tree.createElement(name, attributes)
@tree.openElements[-1].appendChild(element)
@tree.openElements.push(element)
@parser.phase = XmlElementPhase.new(@parser,@tree)
end
def endTagOther(name)
super
@tree.openElements.pop
end
end
class XmlElementPhase < Phase
# Generic handling for all XML elements
@start_tag_handlers = Hash.new(:startTagOther)
@end_tag_handlers = Hash.new(:endTagOther)
def startTagOther(name, attributes)
element = @tree.createElement(name, attributes)
@tree.openElements[-1].appendChild(element)
@tree.openElements.push(element)
end
def endTagOther(name)
for node in @tree.openElements.reverse
if node.name == name
{} while @tree.openElements.pop != node
break
else
@parser.parseError
end
end
end
def processCharacters(data)
@tree.insertText(data)
end
end
end

View file

@ -0,0 +1,188 @@
require 'cgi'
require 'html5/tokenizer'
module HTML5
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes.
#
# It can be either at the Tokenizer stage:
#
# HTMLParser.parse(html, :tokenizer => HTMLSanitizer)
#
# or, if you already have a parse tree (in this example, a REXML tree),
# at the Serializer stage:
#
# tokens = TreeWalkers.getTreeWalker('rexml').new(tree)
# HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
# :sanitize => true})
module HTMLSanitizeModule
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
button caption center cite code col colgroup dd del dfn dir div dl dt
em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
legend li map menu ol optgroup option p pre q s samp select small span
strike strong sub sup table tbody td textarea tfoot th thead tr tt u
ul var]
MATHML_ELEMENTS = %w[maction math merror mfrac mi mmultiscripts mn mo
mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub
msubsup msup mtable mtd mtext mtr munder munderover none]
SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform
circle defs desc ellipse font-face font-face-name font-face-src g
glyph hkern image linearGradient line marker metadata missing-glyph
mpath path polygon polyline radialGradient rect set stop svg switch
text title tspan use]
ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action
align alt axis border cellpadding cellspacing char charoff charset
checked cite class clear cols colspan color compact coords datetime
dir disabled enctype for frame headers height href hreflang hspace id
ismap label lang longdesc maxlength media method multiple name nohref
noshade nowrap prompt readonly rel rev rows rowspan rules scope
selected shape size span src start style summary tabindex target title
type usemap valign value vspace width xml:lang]
MATHML_ATTRIBUTES = %w[actiontype align columnalign columnalign
columnalign columnlines columnspacing columnspan depth display
displaystyle equalcolumns equalrows fence fontstyle fontweight frame
height linethickness lspace mathbackground mathcolor mathvariant
mathvariant maxsize minsize other rowalign rowalign rowalign rowlines
rowspacing rowspan rspace scriptlevel selection separator stretchy
width width xlink:href xlink:show xlink:type xmlns xmlns:xlink]
SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic
arabic-form ascent attributeName attributeType baseProfile bbox begin
by calcMode cap-height class color color-rendering content cx cy d dx
dy descent display dur end fill fill-rule font-family font-size
font-stretch font-style font-variant font-weight from fx fy g1 g2
glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
ideographic k keyPoints keySplines keyTimes lang marker-end
marker-mid marker-start markerHeight markerUnits markerWidth
mathematical max min name offset opacity orient origin
overline-position overline-thickness panose-1 path pathLength points
preserveAspectRatio r refX refY repeatCount repeatDur
requiredExtensions requiredFeatures restart rotate rx ry slope stemh
stemv stop-color stop-opacity strikethrough-position
strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
stroke-width systemLanguage target text-anchor to transform type u1
u2 underline-position underline-thickness unicode unicode-range
units-per-em values version viewBox visibility width widths x
x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
xmlns:xlink y y1 y2 zoomAndPan]
ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href xml:base]
ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color
border-bottom-color border-collapse border-color border-left-color
border-right-color border-top-color clear color cursor direction
display elevation float font font-family font-size font-style
font-variant font-weight height letter-spacing line-height overflow
pause pause-after pause-before pitch pitch-range richness speak
speak-header speak-numeral speak-punctuation speech-rate stress
text-align text-decoration text-indent unicode-bidi vertical-align
voice-family volume white-space width]
ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom
brown center collapse dashed dotted fuchsia gray green !important
italic left lime maroon medium none navy normal nowrap olive pointer
purple red right solid silver teal top transparent underline white
yellow]
ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke
stroke-width stroke-linecap stroke-linejoin stroke-opacity]
ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp
telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]
# subclasses may define their own versions of these constants
ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
def sanitize_token(token)
case token[:type]
when :StartTag, :EndTag, :EmptyTag
if ALLOWED_ELEMENTS.include?(token[:name])
if token.has_key? :data
attrs = Hash[*token[:data].flatten]
attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
ATTR_VAL_IS_URI.each do |attr|
val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
attrs.delete attr
end
end
if attrs['style']
attrs['style'] = sanitize_css(attrs['style'])
end
token[:data] = attrs.map {|k,v| [k,v]}
end
return token
else
if token[:type] == :EndTag
token[:data] = "</#{token[:name]}>"
elsif token[:data]
attrs = token[:data].map {|k,v| " #{k}=\"#{CGI.escapeHTML(v)}\""}.join('')
token[:data] = "<#{token[:name]}#{attrs}>"
else
token[:data] = "<#{token[:name]}>"
end
token[:data].insert(-2,'/') if token[:type] == :EmptyTag
token[:type] = :Characters
token.delete(:name)
return token
end
when :Comment
token[:data] = ""
return token
else
return token
end
end
def sanitize_css(style)
# disallow urls
style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
# gauntlet
return '' unless style =~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
return '' unless style =~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
clean = []
style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
next if val.empty?
prop.downcase!
if ALLOWED_CSS_PROPERTIES.include?(prop)
clean << "#{prop}: #{val};"
elsif %w[background border margin padding].include?(prop.split('-')[0])
clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
!ALLOWED_CSS_KEYWORDS.include?(keyword) and
keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
end
elsif ALLOWED_SVG_PROPERTIES.include?(prop)
clean << "#{prop}: #{val};"
end
end
style = clean.join(' ')
end
end
class HTMLSanitizer < HTMLTokenizer
include HTMLSanitizeModule
def each
super do |token|
yield(sanitize_token(token))
end
end
end
end

View file

@ -0,0 +1,2 @@
require 'html5/serializer/htmlserializer'
require 'html5/serializer/xhtmlserializer'

View file

@ -0,0 +1,178 @@
require 'html5/constants'
module HTML5
class HTMLSerializer
def self.serialize(stream, options = {})
new(options).serialize(stream, options[:encoding])
end
def escape(string)
string.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
end
def initialize(options={})
@quote_attr_values = false
@quote_char = '"'
@use_best_quote_char = true
@minimize_boolean_attributes = true
@use_trailing_solidus = false
@space_before_trailing_solidus = true
@escape_lt_in_attrs = false
@escape_rcdata = false
@omit_optional_tags = true
@sanitize = false
@strip_whitespace = false
@inject_meta_charset = true
options.each do |name, value|
next unless instance_variables.include?("@#{name}")
@use_best_quote_char = false if name.to_s == 'quote_char'
instance_variable_set("@#{name}", value)
end
@errors = []
end
def serialize(treewalker, encoding=nil)
in_cdata = false
@errors = []
if encoding and @inject_meta_charset
require 'html5/filters/inject_meta_charset'
treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
end
if @strip_whitespace
require 'html5/filters/whitespace'
treewalker = Filters::WhitespaceFilter.new(treewalker)
end
if @sanitize
require 'html5/filters/sanitizer'
treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
end
if @omit_optional_tags
require 'html5/filters/optionaltags'
treewalker = Filters::OptionalTagFilter.new(treewalker)
end
result = []
treewalker.each do |token|
type = token[:type]
if type == :Doctype
doctype = "<!DOCTYPE %s>" % token[:name]
result << doctype
elsif [:Characters, :SpaceCharacters].include? type
if type == :SpaceCharacters or in_cdata
if in_cdata and token[:data].include?("</")
serializeError(_("Unexpected </ in CDATA"))
end
result << token[:data]
else
result << escape(token[:data])
end
elsif [:StartTag, :EmptyTag].include? type
name = token[:name]
if RCDATA_ELEMENTS.include?(name) and not @escape_rcdata
in_cdata = true
elsif in_cdata
serializeError(_("Unexpected child element of a CDATA element"))
end
attributes = []
for k,v in attrs = token[:data].to_a.sort
attributes << ' '
attributes << k
if not @minimize_boolean_attributes or \
(!(BOOLEAN_ATTRIBUTES[name]||[]).include?(k) \
and !BOOLEAN_ATTRIBUTES[:global].include?(k))
attributes << "="
if @quote_attr_values or v.empty?
quote_attr = true
else
quote_attr = (SPACE_CHARACTERS + %w(< > " ')).any? {|c| v.include?(c)}
end
v = v.gsub("&", "&amp;")
v = v.gsub("<", "&lt;") if @escape_lt_in_attrs
if quote_attr
quote_char = @quote_char
if @use_best_quote_char
if v.index("'") and !v.index('"')
quote_char = '"'
elsif v.index('"') and !v.index("'")
quote_char = "'"
end
end
if quote_char == "'"
v = v.gsub("'", "&#39;")
else
v = v.gsub('"', "&quot;")
end
attributes << quote_char << v << quote_char
else
attributes << v
end
end
end
if VOID_ELEMENTS.include?(name) and @use_trailing_solidus
if @space_before_trailing_solidus
attributes << " /"
else
attributes << "/"
end
end
result << "<%s%s>" % [name, attributes.join('')]
elsif type == :EndTag
name = token[:name]
if RCDATA_ELEMENTS.include?(name)
in_cdata = false
elsif in_cdata
serializeError(_("Unexpected child element of a CDATA element"))
end
end_tag = "</#{name}>"
result << end_tag
elsif type == :Comment
data = token[:data]
serializeError(_("Comment contains --")) if data.index("--")
comment = "<!--%s-->" % token[:data]
result << comment
else
serializeError(token[:data])
end
end
if encoding and encoding != 'utf-8'
require 'iconv'
Iconv.iconv(encoding, 'utf-8', result.join('')).first
else
result.join('')
end
end
alias :render :serialize
def serializeError(data="XXX ERROR MESSAGE NEEDED")
# XXX The idea is to make data mandatory.
@errors.push(data)
if @strict
raise SerializeError
end
end
end
# Error in serialized tree
class SerializeError < Exception
end
end

View file

@ -0,0 +1,20 @@
require 'html5/serializer/htmlserializer'
module HTML5
class XHTMLSerializer < HTMLSerializer
DEFAULTS = {
:quote_attr_values => true,
:minimize_boolean_attributes => false,
:use_trailing_solidus => true,
:escape_lt_in_attrs => true,
:omit_optional_tags => false,
:escape_rcdata => true
}
def initialize(options={})
super(DEFAULTS.clone.update(options))
end
end
end

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,24 @@
module HTML5
module TreeBuilders
class << self
def [](name)
case name.to_s.downcase
when 'simpletree' then
require 'html5/treebuilders/simpletree'
SimpleTree::TreeBuilder
when 'rexml' then
require 'html5/treebuilders/rexml'
REXML::TreeBuilder
when 'hpricot' then
require 'html5/treebuilders/hpricot'
Hpricot::TreeBuilder
else
raise "Unknown TreeBuilder #{name}"
end
end
alias :getTreeBuilder :[]
end
end
end

View file

@ -0,0 +1,330 @@
require 'html5/constants'
#XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
module HTML5
# The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, buttons, object elements, and marquees.
Marker = nil
module TreeBuilders
module Base
class Node
# The parent of the current node (or nil for the document node)
attr_accessor :parent
# a list of child nodes of the current node. This must
# include all elements but not necessarily other node types
attr_accessor :childNodes
# A list of miscellaneous flags that can be set on the node
attr_accessor :_flags
def initialize(name)
@parent = nil
@childNodes = []
@_flags = []
end
# Insert node as a child of the current node
def appendChild(node)
raise NotImplementedError
end
# Insert data as text in the current node, positioned before the
# start of node insertBefore or to the end of the node's text.
def insertText(data, insertBefore=nil)
raise NotImplementedError
end
# Insert node as a child of the current node, before refNode in the
# list of child nodes. Raises ValueError if refNode is not a child of
# the current node
def insertBefore(node, refNode)
raise NotImplementedError
end
# Remove node from the children of the current node
def removeChild(node)
raise NotImplementedError
end
# Move all the children of the current node to newParent.
# This is needed so that trees that don't store text as nodes move the
# text in the correct way
def reparentChildren(newParent)
#XXX - should this method be made more general?
@childNodes.each { |child| newParent.appendChild(child) }
@childNodes = []
end
# Return a shallow copy of the current node i.e. a node with the same
# name and attributes but with no parent or child nodes
def cloneNode
raise NotImplementedError
end
# Return true if the node has children or text, false otherwise
def hasContent
raise NotImplementedError
end
end
# Base treebuilder implementation
class TreeBuilder
attr_accessor :openElements
attr_accessor :activeFormattingElements
attr_accessor :document
attr_accessor :headPointer
attr_accessor :formPointer
# Class to use for document root
documentClass = nil
# Class to use for HTML elements
elementClass = nil
# Class to use for comments
commentClass = nil
# Class to use for doctypes
doctypeClass = nil
# Fragment class
fragmentClass = nil
def initialize
reset
end
def reset
@openElements = []
@activeFormattingElements = []
#XXX - rename these to headElement, formElement
@headPointer = nil
@formPointer = nil
self.insertFromTable = false
@document = @documentClass.new
end
def elementInScope(target, tableVariant=false)
# Exit early when possible.
return true if @openElements[-1].name == target
# AT How about while true and simply set node to [-1] and set it to
# [-2] at the end...
@openElements.reverse.each do |element|
if element.name == target
return true
elsif element.name == 'table'
return false
elsif not tableVariant and SCOPING_ELEMENTS.include?(element.name)
return false
elsif element.name == 'html'
return false
end
end
assert false # We should never reach this point
end
def reconstructActiveFormattingElements
# Within this algorithm the order of steps described in the
# specification is not quite the same as the order of steps in the
# code. It should still do the same though.
# Step 1: stop the algorithm when there's nothing to do.
return if @activeFormattingElements.empty?
# Step 2 and step 3: we start with the last element. So i is -1.
i = -1
entry = @activeFormattingElements[i]
return if entry == Marker or @openElements.include?(entry)
# Step 6
until entry == Marker or @openElements.include?(entry)
# Step 5: let entry be one earlier in the list.
i -= 1
begin
entry = @activeFormattingElements[i]
rescue
# Step 4: at this point we need to jump to step 8. By not doing
# i += 1 which is also done in step 7 we achieve that.
break
end
end
while true
# Step 7
i += 1
# Step 8
clone = @activeFormattingElements[i].cloneNode
# Step 9
element = insertElement(clone.name, clone.attributes)
# Step 10
@activeFormattingElements[i] = element
# Step 11
break if element == @activeFormattingElements[-1]
end
end
def clearActiveFormattingElements
{} until @activeFormattingElements.empty? || @activeFormattingElements.pop == Marker
end
# Check if an element exists between the end of the active
# formatting elements and the last marker. If it does, return it, else
# return false
def elementInActiveFormattingElements(name)
@activeFormattingElements.reverse.each do |element|
# Check for Marker first because if it's a Marker it doesn't have a
# name attribute.
break if element == Marker
return element if element.name == name
end
return false
end
def insertDoctype(name)
@document.appendChild(@doctypeClass.new(name))
end
def insertComment(data, parent=nil)
parent = @openElements[-1] if parent.nil?
parent.appendChild(@commentClass.new(data))
end
# Create an element but don't insert it anywhere
def createElement(name, attributes)
element = @elementClass.new(name)
element.attributes = attributes
return element
end
# Switch the function used to insert an element from the
# normal one to the misnested table one and back again
def insertFromTable=(value)
@insertFromTable = value
@insertElement = value ? :insertElementTable : :insertElementNormal
end
def insertElement(name, attributes)
send(@insertElement, name, attributes)
end
def insertElementNormal(name, attributes)
element = @elementClass.new(name)
element.attributes = attributes
@openElements[-1].appendChild(element)
@openElements.push(element)
return element
end
# Create an element and insert it into the tree
def insertElementTable(name, attributes)
element = @elementClass.new(name)
element.attributes = attributes
if TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)
#We should be in the InTable mode. This means we want to do
#special magic element rearranging
parent, insertBefore = getTableMisnestedNodePosition
if insertBefore.nil?
parent.appendChild(element)
else
parent.insertBefore(element, insertBefore)
end
@openElements.push(element)
else
return insertElementNormal(name, attributes)
end
return element
end
def insertText(data, parent=nil)
parent = @openElements[-1] if parent.nil?
if (not(@insertFromTable) or (@insertFromTable and not TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)))
parent.insertText(data)
else
#We should be in the InTable mode. This means we want to do
#special magic element rearranging
parent, insertBefore = getTableMisnestedNodePosition
parent.insertText(data, insertBefore)
end
end
# Get the foster parent element, and sibling to insert before
# (or nil) when inserting a misnested table node
def getTableMisnestedNodePosition
#The foster parent element is the one which comes before the most
#recently opened table element
#XXX - this is really inelegant
lastTable = nil
fosterParent = nil
insertBefore = nil
@openElements.reverse.each do |element|
if element.name == "table"
lastTable = element
break
end
end
if lastTable
#XXX - we should really check that this parent is actually a
#node here
if lastTable.parent
fosterParent = lastTable.parent
insertBefore = lastTable
else
fosterParent = @openElements[@openElements.index(lastTable) - 1]
end
else
fosterParent = @openElements[0]
end
return fosterParent, insertBefore
end
def generateImpliedEndTags(exclude=nil)
name = @openElements[-1].name
if (['dd', 'dt', 'li', 'p', 'td', 'th', 'tr'].include?(name) and name != exclude)
@openElements.pop
# XXX This is not entirely what the specification says. We should
# investigate it more closely.
generateImpliedEndTags(exclude)
end
end
def getDocument
@document
end
def getFragment
#assert @innerHTML
fragment = @fragmentClass.new
@openElements[0].reparentChildren(fragment)
return fragment
end
# Serialize the subtree of node in the format required by unit tests
# node - the node from which to start serializing
def testSerializer(node)
raise NotImplementedError
end
end
end
end
end

View file

@ -0,0 +1,221 @@
require 'html5/treebuilders/base'
require 'rubygems'
require 'hpricot'
require 'forwardable'
module HTML5
module TreeBuilders
module Hpricot
class Node < Base::Node
extend Forwardable
def_delegators :@hpricot, :name
attr_accessor :hpricot
def initialize(name)
super(name)
@hpricot = self.class.hpricot_class.new name
end
def appendChild(node)
if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s
else
childNodes << node
hpricot.children << node.hpricot
end
if (oldparent = node.hpricot.parent) != nil
oldparent.children.delete_at(oldparent.children.index(node.hpricot))
end
node.hpricot.parent = hpricot
node.parent = self
end
def removeChild(node)
childNodes.delete(node)
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
node.hpricot.parent = nil
node.parent = nil
end
def insertText(data, before=nil)
if before
insertBefore(TextNode.new(data), before)
else
appendChild(TextNode.new(data))
end
end
def insertBefore(node, refNode)
index = childNodes.index(refNode)
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
else
refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
childNodes.insert(index, node)
end
end
def hasContent
childNodes.any?
end
end
class Element < Node
def self.hpricot_class
::Hpricot::Elem
end
def initialize(name)
super(name)
@hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
end
def name
@hpricot.stag.name
end
def cloneNode
attributes.inject(self.class.new(name)) do |node, (name, value)|
node.hpricot[name] = value
node
end
end
# A call to Hpricot::Elem#raw_attributes is built dynamically,
# so alterations to the returned value (a hash) will be lost.
#
# AttributeProxy works around this by forwarding :[]= calls
# to the raw_attributes accessor on the element start tag.
#
class AttributeProxy
def initialize(hpricot)
@hpricot = hpricot
end
def []=(k, v)
@hpricot.stag.send(stag_attributes_method)[k] = v
end
def stag_attributes_method
# STag#attributes changed to STag#raw_attributes after Hpricot 0.5
@hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
end
def method_missing(*a, &b)
@hpricot.attributes.send(*a, &b)
end
end
def attributes
AttributeProxy.new(@hpricot)
end
def attributes=(attrs)
attrs.each { |name, value| @hpricot[name] = value }
end
def printTree(indent=0)
tree = "\n|#{' ' * indent}<#{name}>"
indent += 2
attributes.each do |name, value|
next if name == 'xmlns'
tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
end
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
end
end
class Document < Node
def self.hpricot_class
::Hpricot::Doc
end
def initialize
super(nil)
end
def printTree(indent=0)
childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
end
end
class DocumentType < Node
def self.hpricot_class
::Hpricot::DocType
end
def initialize(name)
begin
super(name)
rescue ArgumentError # needs 3...
end
@hpricot = ::Hpricot::DocType.new(name, nil, nil)
end
def printTree(indent=0)
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
end
end
class DocumentFragment < Element
def initialize
super('')
end
def printTree(indent=0)
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
end
end
class TextNode < Node
def initialize(data)
@hpricot = ::Hpricot::Text.new(data)
end
def printTree(indent=0)
"\n|#{' ' * indent}\"#{hpricot.content}\""
end
end
class CommentNode < Node
def self.hpricot_class
::Hpricot::Comment
end
def printTree(indent=0)
"\n|#{' ' * indent}<!-- #{hpricot.content} -->"
end
end
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@fragmentClass = DocumentFragment
end
def testSerializer(node)
node.printTree
end
def getDocument
@document.hpricot
end
def getFragment
@document = super
return @document.hpricot.children
end
end
end
end
end

View file

@ -0,0 +1,192 @@
require 'html5/treebuilders/base'
require 'rexml/document'
require 'forwardable'
module HTML5
module TreeBuilders
module REXML
class Node < Base::Node
extend Forwardable
def_delegators :@rxobj, :name, :attributes
attr_accessor :rxobj
def initialize name
super name
@rxobj = self.class.rxclass.new name
end
def appendChild node
if node.kind_of? TextNode and
childNodes.length>0 and childNodes[-1].kind_of? TextNode
childNodes[-1].rxobj.value =
childNodes[-1].rxobj.to_s + node.rxobj.to_s
childNodes[-1].rxobj.raw = true
else
childNodes.push node
rxobj.add node.rxobj
end
node.parent = self
end
def removeChild node
childNodes.delete node
rxobj.delete node.rxobj
node.parent = nil
end
def insertText data, before=nil
if before
insertBefore TextNode.new(data), before
else
appendChild TextNode.new(data)
end
end
def insertBefore node, refNode
index = childNodes.index(refNode)
if node.kind_of? TextNode and index>0 and
childNodes[index-1].kind_of? TextNode
childNodes[index-1].rxobj.value =
childNodes[index-1].rxobj.to_s + node.rxobj.to_s
childNodes[index-1].rxobj.raw = true
else
childNodes.insert index, node
refNode.rxobj.parent.insert_before(refNode.rxobj,node.rxobj)
end
end
def hasContent
return (childNodes.length > 0)
end
end
class Element < Node
def self.rxclass
::REXML::Element
end
def initialize name
super name
end
def cloneNode
newNode = self.class.new name
attributes.each {|name,value| newNode.attributes[name] = value}
newNode
end
def attributes= value
value.each {|name, value| rxobj.attributes[name]=value}
end
def printTree indent=0
tree = "\n|#{' ' * indent}<#{name}>"
indent += 2
for name, value in attributes
next if name == 'xmlns'
tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
end
for child in childNodes
tree += child.printTree(indent)
end
return tree
end
end
class Document < Node
def self.rxclass
::REXML::Document
end
def initialize
super nil
end
def appendChild node
if node.kind_of? Element and node.name == 'html'
node.rxobj.add_namespace('http://www.w3.org/1999/xhtml')
end
super node
end
def printTree indent=0
tree = "#document"
for child in childNodes
tree += child.printTree(indent + 2)
end
return tree
end
end
class DocumentType < Node
def self.rxclass
::REXML::DocType
end
def printTree indent=0
"\n|#{' ' * indent}<!DOCTYPE #{name}>"
end
end
class DocumentFragment < Element
def initialize
super nil
end
def printTree indent=0
tree = ""
for child in childNodes
tree += child.printTree(indent+2)
end
return tree
end
end
class TextNode < Node
def initialize data
raw=data.gsub('&','&amp;').gsub('<','&lt;').gsub('>','&gt;')
@rxobj = ::REXML::Text.new(raw, true, nil, true)
end
def printTree indent=0
"\n|#{' ' * indent}\"#{rxobj.value}\""
end
end
class CommentNode < Node
def self.rxclass
::REXML::Comment
end
def printTree indent=0
"\n|#{' ' * indent}<!-- #{rxobj.string} -->"
end
end
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@fragmentClass = DocumentFragment
end
def testSerializer node
node.printTree()
end
def getDocument
@document.rxobj
end
def getFragment
@document = super
return @document.rxobj.children
end
end
end
end
end

View file

@ -0,0 +1,178 @@
require 'html5/treebuilders/base'
module HTML5
module TreeBuilders
module SimpleTree
class Node < Base::Node
# Node representing an item in the tree.
# name - The tag name associated with the node
attr_accessor :name
# The value of the current node (applies to text nodes and
# comments
attr_accessor :value
# a dict holding name, value pairs for attributes of the node
attr_accessor :attributes
def initialize name
super
@name = name
@value = nil
@attributes = {}
end
def appendChild node
if node.kind_of? TextNode and
childNodes.length>0 and childNodes[-1].kind_of? TextNode
childNodes[-1].value += node.value
else
childNodes.push node
end
node.parent = self
end
def removeChild node
childNodes.delete node
node.parent = nil
end
def cloneNode
newNode = self.class.new name
attributes.each {|name,value| newNode.attributes[name] = value}
newNode.value = value
newNode
end
def insertText data, before=nil
if before
insertBefore TextNode.new(data), before
else
appendChild TextNode.new(data)
end
end
def insertBefore node, refNode
index = childNodes.index(refNode)
if node.kind_of? TextNode and index>0 and
childNodes[index-1].kind_of? TextNode
childNodes[index-1].value += node.value
else
childNodes.insert index, node
end
end
def printTree indent=0
tree = "\n|%s%s" % [' '* indent, self.to_s]
for child in childNodes
tree += child.printTree(indent + 2)
end
return tree
end
def hasContent
return (childNodes.length > 0)
end
end
class Element < Node
def to_s
"<#{name}>"
end
def printTree indent=0
tree = "\n|%s%s" % [' '* indent, self.to_s]
indent += 2
for name, value in attributes
tree += "\n|%s%s=\"%s\"" % [' ' * indent, name, value]
end
for child in childNodes
tree += child.printTree(indent)
end
return tree
end
end
class Document < Node
def to_s
"#document"
end
def initialize
super nil
end
def printTree indent=0
tree = to_s
for child in childNodes
tree += child.printTree(indent + 2)
end
return tree
end
end
class DocumentType < Node
def to_s
"<!DOCTYPE %s>" % name
end
end
class DocumentFragment < Element
def initialize
super nil
end
def printTree indent=0
tree = ""
for child in childNodes
tree += child.printTree(indent+2)
end
return tree
end
end
class TextNode < Node
def initialize value
super nil
@value = value
end
def to_s
'"%s"' % value
end
end
class CommentNode < Node
def initialize value
super nil
@value = value
end
def to_s
"<!-- %s -->" % value
end
end
class TreeBuilder < Base::TreeBuilder
def initialize
@documentClass = Document
@doctypeClass = DocumentType
@elementClass = Element
@commentClass = CommentNode
@fragmentClass = DocumentFragment
end
def testSerializer node
node.printTree()
end
def getFragment
@document = super
return @document.childNodes
end
end
end
end
end

View file

@ -0,0 +1,26 @@
require 'html5/treewalkers/base'
module HTML5
module TreeWalkers
class << self
def [](name)
case name.to_s.downcase
when 'simpletree' then
require 'html5/treewalkers/simpletree'
SimpleTree::TreeWalker
when 'rexml' then
require 'html5/treewalkers/rexml'
REXML::TreeWalker
when 'hpricot' then
require 'html5/treewalkers/hpricot'
Hpricot::TreeWalker
else
raise "Unknown TreeWalker #{name}"
end
end
alias :getTreeWalker :[]
end
end
end

View file

@ -0,0 +1,156 @@
require 'html5/constants'
module HTML5
module TreeWalkers
module TokenConstructor
def error(msg)
return {:type => "SerializeError", :data => msg}
end
def normalizeAttrs(attrs)
attrs.to_a
end
def emptyTag(name, attrs, hasChildren=false)
error(_("Void element has children")) if hasChildren
return({:type => :EmptyTag, :name => name, \
:data => normalizeAttrs(attrs)})
end
def startTag(name, attrs)
return {:type => :StartTag, :name => name, \
:data => normalizeAttrs(attrs)}
end
def endTag(name)
return {:type => :EndTag, :name => name, :data => []}
end
def text(data)
if data =~ /\A([#{SPACE_CHARACTERS.join('')}]+)/m
yield({:type => :SpaceCharacters, :data => $1})
data = data[$1.length .. -1]
return if data.empty?
end
if data =~ /([#{SPACE_CHARACTERS.join('')}]+)\Z/m
yield({:type => :Characters, :data => data[0 ... -$1.length]})
yield({:type => :SpaceCharacters, :data => $1})
else
yield({:type => :Characters, :data => data})
end
end
def comment(data)
return {:type => :Comment, :data => data}
end
def doctype(name)
return {:type => :Doctype, :name => name, :data => name.upcase() == "HTML"}
end
def unknown(nodeType)
return error(_("Unknown node type: ") + nodeType.to_s)
end
def _(str)
str
end
end
class Base
include TokenConstructor
def initialize(tree)
@tree = tree
end
def each
raise NotImplementedError
end
alias walk each
end
class NonRecursiveTreeWalker < TreeWalkers::Base
def node_details(node)
raise NotImplementedError
end
def first_child(node)
raise NotImplementedError
end
def next_sibling(node)
raise NotImplementedError
end
def parent(node)
raise NotImplementedError
end
def each
currentNode = @tree
while currentNode != nil
details = node_details(currentNode)
hasChildren = false
case details.shift
when :DOCTYPE
yield doctype(*details)
when :TEXT
text(*details) {|token| yield token}
when :ELEMENT
name, attributes, hasChildren = details
if VOID_ELEMENTS.include?(name)
yield emptyTag(name, attributes.to_a, hasChildren)
hasChildren = false
else
yield startTag(name, attributes.to_a)
end
when :COMMENT
yield comment(details[0])
when :DOCUMENT, :DOCUMENT_FRAGMENT
hasChildren = true
when nil
# ignore (REXML::XMLDecl is an example)
else
yield unknown(details[0])
end
firstChild = hasChildren ? first_child(currentNode) : nil
if firstChild != nil
currentNode = firstChild
else
while currentNode != nil
details = node_details(currentNode)
if details.shift == :ELEMENT
name, attributes, hasChildren = details
yield endTag(name) if !VOID_ELEMENTS.include?(name)
end
if @tree == currentNode
currentNode = nil
else
nextSibling = next_sibling(currentNode)
if nextSibling != nil
currentNode = nextSibling
break
end
currentNode = parent(currentNode)
end
end
end
end
end
end
end
end

View file

@ -0,0 +1,48 @@
require 'html5/treewalkers/base'
require 'rexml/document'
module HTML5
module TreeWalkers
module Hpricot
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
def node_details(node)
case node
when ::Hpricot::Elem
if node.name.empty?
[:DOCUMENT_FRAGMENT]
else
[:ELEMENT, node.name,
node.attributes.map {|name,value| [name,value]},
!node.empty?]
end
when ::Hpricot::Text
[:TEXT, node.to_plain_text]
when ::Hpricot::Comment
[:COMMENT, node.content]
when ::Hpricot::Doc
[:DOCUMENT]
when ::Hpricot::DocType
[:DOCTYPE, node.target]
when ::Hpricot::XMLDecl
[nil]
else
[:UNKNOWN, node.class.inspect]
end
end
def first_child(node)
node.children.first
end
def next_sibling(node)
node.next_node
end
def parent(node)
node.parent
end
end
end
end
end

View file

@ -0,0 +1,48 @@
require 'html5/treewalkers/base'
require 'rexml/document'
module HTML5
module TreeWalkers
module REXML
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
def node_details(node)
case node
when ::REXML::Document
[:DOCUMENT]
when ::REXML::Element
if !node.name
[:DOCUMENT_FRAGMENT]
else
[:ELEMENT, node.name,
node.attributes.map {|name,value| [name,value]},
node.has_elements? || node.has_text?]
end
when ::REXML::Text
[:TEXT, node.value]
when ::REXML::Comment
[:COMMENT, node.string]
when ::REXML::DocType
[:DOCTYPE, node.name]
when ::REXML::XMLDecl
[nil]
else
[:UNKNOWN, node.class.inspect]
end
end
def first_child(node)
node.children.first
end
def next_sibling(node)
node.next_sibling
end
def parent(node)
node.parent
end
end
end
end
end

View file

@ -0,0 +1,48 @@
require 'html5/treewalkers/base'
module HTML5
module TreeWalkers
module SimpleTree
class TreeWalker < HTML5::TreeWalkers::Base
include HTML5::TreeBuilders::SimpleTree
def walk(node)
case node
when Document, DocumentFragment
return
when DocumentType
yield doctype(node.name)
when TextNode
text(node.value) {|token| yield token}
when Element
if VOID_ELEMENTS.include?(node.name)
yield emptyTag(node.name, node.attributes, node.hasContent())
else
yield startTag(node.name, node.attributes)
for child in node.childNodes
walk(child) {|token| yield token}
end
yield endTag(node.name)
end
when CommentNode
yield comment(node.value)
else
puts '?'
yield unknown(node.class)
end
end
def each
for child in @tree.childNodes
walk(child) {|node| yield node}
end
end
end
end
end
end