Sync with latest HTML5lib and latest Maruku

This commit is contained in:
Jacques Distler 2007-07-04 17:36:59 -05:00
parent 8e92e4a3ab
commit 8ccaad85a5
71 changed files with 1974 additions and 1621 deletions

View file

@ -25,14 +25,14 @@
module Sanitize
require 'html5lib/html5parser'
require 'html5lib/liberalxmlparser'
require 'html5lib/treewalkers'
require 'html5lib/treebuilders'
require 'html5lib/serializer'
require 'html5lib/sanitizer'
require 'html5/html5parser'
require 'html5/liberalxmlparser'
require 'html5/treewalkers'
require 'html5/treebuilders'
require 'html5/serializer'
require 'html5/sanitizer'
include HTML5lib
include HTML5
# Sanitize a string, parsed using XHTML parsing rules.
#

View file

@ -1,11 +1,11 @@
require 'html5lib/html5parser'
# Top-level convenience entry points that forward to HTMLParser.
module HTML5lib
  # Parse +stream+ as a complete document.
  #
  # stream  - the HTML input, handed to HTMLParser.parse unchanged
  # options - option hash, handed to HTMLParser.parse unchanged
  #
  # Returns whatever HTMLParser.parse returns.
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  # Parse +stream+ as a fragment.
  #
  # Forwards to HTMLParser.parseFragment. This used to forward to
  # HTMLParser.parse, which meant fragment parsing was never reached.
  #
  # Returns whatever HTMLParser.parseFragment returns.
  def self.parseFragment(stream, options={})
    HTMLParser.parseFragment(stream, options)
  end
end
require 'html5/html5parser'
# Top-level convenience entry points that forward to HTMLParser.
module HTML5
  # Parse +stream+ as a complete document.
  #
  # stream  - the HTML input, handed to HTMLParser.parse unchanged
  # options - option hash, handed to HTMLParser.parse unchanged
  #
  # Returns whatever HTMLParser.parse returns.
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  # Parse +stream+ as a fragment.
  #
  # Forwards to HTMLParser.parseFragment. This used to forward to
  # HTMLParser.parse, which meant fragment parsing was never reached.
  #
  # Returns whatever HTMLParser.parseFragment returns.
  def self.parseFragment(stream, options={})
    HTMLParser.parseFragment(stream, options)
  end
end

817
vendor/plugins/HTML5lib/lib/html5/constants.rb vendored Executable file
View file

@ -0,0 +1,817 @@
module HTML5
class EOF < Exception; end
CONTENT_MODEL_FLAGS = [
:PCDATA,
:RCDATA,
:CDATA,
:PLAINTEXT
]
SCOPING_ELEMENTS = %w[
button
caption
html
marquee
object
table
td
th
]
FORMATTING_ELEMENTS = %w[
a
b
big
em
font
i
nobr
s
small
strike
strong
tt
u
]
SPECIAL_ELEMENTS = %w[
address
area
base
basefont
bgsound
blockquote
body
br
center
col
colgroup
dd
dir
div
dl
dt
embed
fieldset
form
frame
frameset
h1
h2
h3
h4
h5
h6
head
hr
iframe
image
img
input
isindex
li
link
listing
menu
meta
noembed
noframes
noscript
ol
optgroup
option
p
param
plaintext
pre
script
select
spacer
style
tbody
textarea
tfoot
thead
title
tr
ul
wbr
]
# Whitespace characters, in this order: tab, line feed, vertical tab,
# form feed, space, carriage return.
SPACE_CHARACTERS = ["\t", "\n", "\x0B", "\x0C", "\x20", "\r"]
TABLE_INSERT_MODE_ELEMENTS = %w[
table
tbody
tfoot
thead
tr
]
# ASCII character sets. Note the mixed types: the three letter constants are
# Strings, DIGITS is a Range, and HEX_DIGITS is an Array of one-character
# Strings (digits first, then a-f, then A-F).
ASCII_LOWERCASE = [*('a'..'z')].join
ASCII_UPPERCASE = [*('A'..'Z')].join
ASCII_LETTERS = "#{ASCII_LOWERCASE}#{ASCII_UPPERCASE}"
DIGITS = Range.new('0', '9')
HEX_DIGITS = [DIGITS, 'a'..'f', 'A'..'F'].map { |chars| chars.to_a }.flatten
# Heading elements need to be ordered
HEADING_ELEMENTS = %w[
h1
h2
h3
h4
h5
h6
]
# XXX What about event-source and command?
VOID_ELEMENTS = %w[
base
link
meta
hr
br
img
embed
param
area
col
input
]
CDATA_ELEMENTS = %w[title textarea]
RCDATA_ELEMENTS = %w[
style
script
xmp
iframe
noembed
noframes
noscript
]
BOOLEAN_ATTRIBUTES = {
:global => %w[irrelevant],
'style' => %w[scoped],
'img' => %w[ismap],
'audio' => %w[autoplay controls],
'video' => %w[autoplay controls],
'script' => %w[defer async],
'details' => %w[open],
'datagrid' => %w[multiple disabled],
'command' => %w[hidden disabled checked default],
'menu' => %w[autosubmit],
'fieldset' => %w[disabled readonly],
'option' => %w[disabled readonly selected],
'optgroup' => %w[disabled readonly],
'button' => %w[disabled autofocus],
'input' => %w[disabled readonly required autofocus checked ismap],
'select' => %w[disabled readonly autofocus multiple],
'output' => %w[disabled readonly]
}
# ENTITIES_WINDOWS1252 must stay _ordered_: it is looked up by index, and
# entry i holds the Unicode code point for Windows-1252 byte 0x80 + i.
ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN
65533, # 0x81 UNDEFINED
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
8224, # 0x86 0x2020 DAGGER
8225, # 0x87 0x2021 DOUBLE DAGGER
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
8240, # 0x89 0x2030 PER MILLE SIGN
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
65533, # 0x8D UNDEFINED
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
65533, # 0x8F UNDEFINED
65533, # 0x90 UNDEFINED
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
8226, # 0x95 0x2022 BULLET
8211, # 0x96 0x2013 EN DASH
8212, # 0x97 0x2014 EM DASH
732, # 0x98 0x02DC SMALL TILDE
8482, # 0x99 0x2122 TRADE MARK SIGN
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
65533, # 0x9D UNDEFINED
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
]
# ENTITIES was generated from Python using the following code:
#
# import constants
# entities = constants.entities.items()
# entities.sort()
# list = [ ' '.join([repr(entity), '=>', ord(value)<128 and
# repr(str(value)) or repr(value.encode('utf-8')).replace("'",'"')])
# for entity, value in entities]
# print ' ENTITIES = {\n ' + ',\n '.join(list) + '\n }'
ENTITIES = {
'AElig' => "\xc3\x86",
'AElig;' => "\xc3\x86",
'AMP' => '&',
'AMP;' => '&',
'Aacute' => "\xc3\x81",
'Aacute;' => "\xc3\x81",
'Acirc' => "\xc3\x82",
'Acirc;' => "\xc3\x82",
'Agrave' => "\xc3\x80",
'Agrave;' => "\xc3\x80",
'Alpha;' => "\xce\x91",
'Aring' => "\xc3\x85",
'Aring;' => "\xc3\x85",
'Atilde' => "\xc3\x83",
'Atilde;' => "\xc3\x83",
'Auml' => "\xc3\x84",
'Auml;' => "\xc3\x84",
'Beta;' => "\xce\x92",
'COPY' => "\xc2\xa9",
'COPY;' => "\xc2\xa9",
'Ccedil' => "\xc3\x87",
'Ccedil;' => "\xc3\x87",
'Chi;' => "\xce\xa7",
'Dagger;' => "\xe2\x80\xa1",
'Delta;' => "\xce\x94",
'ETH' => "\xc3\x90",
'ETH;' => "\xc3\x90",
'Eacute' => "\xc3\x89",
'Eacute;' => "\xc3\x89",
'Ecirc' => "\xc3\x8a",
'Ecirc;' => "\xc3\x8a",
'Egrave' => "\xc3\x88",
'Egrave;' => "\xc3\x88",
'Epsilon;' => "\xce\x95",
'Eta;' => "\xce\x97",
'Euml' => "\xc3\x8b",
'Euml;' => "\xc3\x8b",
'GT' => '>',
'GT;' => '>',
'Gamma;' => "\xce\x93",
'Iacute' => "\xc3\x8d",
'Iacute;' => "\xc3\x8d",
'Icirc' => "\xc3\x8e",
'Icirc;' => "\xc3\x8e",
'Igrave' => "\xc3\x8c",
'Igrave;' => "\xc3\x8c",
'Iota;' => "\xce\x99",
'Iuml' => "\xc3\x8f",
'Iuml;' => "\xc3\x8f",
'Kappa;' => "\xce\x9a",
'LT' => '<',
'LT;' => '<',
'Lambda;' => "\xce\x9b",
'Mu;' => "\xce\x9c",
'Ntilde' => "\xc3\x91",
'Ntilde;' => "\xc3\x91",
'Nu;' => "\xce\x9d",
'OElig;' => "\xc5\x92",
'Oacute' => "\xc3\x93",
'Oacute;' => "\xc3\x93",
'Ocirc' => "\xc3\x94",
'Ocirc;' => "\xc3\x94",
'Ograve' => "\xc3\x92",
'Ograve;' => "\xc3\x92",
'Omega;' => "\xce\xa9",
'Omicron;' => "\xce\x9f",
'Oslash' => "\xc3\x98",
'Oslash;' => "\xc3\x98",
'Otilde' => "\xc3\x95",
'Otilde;' => "\xc3\x95",
'Ouml' => "\xc3\x96",
'Ouml;' => "\xc3\x96",
'Phi;' => "\xce\xa6",
'Pi;' => "\xce\xa0",
'Prime;' => "\xe2\x80\xb3",
'Psi;' => "\xce\xa8",
'QUOT' => '"',
'QUOT;' => '"',
'REG' => "\xc2\xae",
'REG;' => "\xc2\xae",
'Rho;' => "\xce\xa1",
'Scaron;' => "\xc5\xa0",
'Sigma;' => "\xce\xa3",
'THORN' => "\xc3\x9e",
'THORN;' => "\xc3\x9e",
'TRADE;' => "\xe2\x84\xa2",
'Tau;' => "\xce\xa4",
'Theta;' => "\xce\x98",
'Uacute' => "\xc3\x9a",
'Uacute;' => "\xc3\x9a",
'Ucirc' => "\xc3\x9b",
'Ucirc;' => "\xc3\x9b",
'Ugrave' => "\xc3\x99",
'Ugrave;' => "\xc3\x99",
'Upsilon;' => "\xce\xa5",
'Uuml' => "\xc3\x9c",
'Uuml;' => "\xc3\x9c",
'Xi;' => "\xce\x9e",
'Yacute' => "\xc3\x9d",
'Yacute;' => "\xc3\x9d",
'Yuml;' => "\xc5\xb8",
'Zeta;' => "\xce\x96",
'aacute' => "\xc3\xa1",
'aacute;' => "\xc3\xa1",
'acirc' => "\xc3\xa2",
'acirc;' => "\xc3\xa2",
'acute' => "\xc2\xb4",
'acute;' => "\xc2\xb4",
'aelig' => "\xc3\xa6",
'aelig;' => "\xc3\xa6",
'agrave' => "\xc3\xa0",
'agrave;' => "\xc3\xa0",
'alefsym;' => "\xe2\x84\xb5",
'alpha;' => "\xce\xb1",
'amp' => '&',
'amp;' => '&',
'and;' => "\xe2\x88\xa7",
'ang;' => "\xe2\x88\xa0",
'apos;' => "'",
'aring' => "\xc3\xa5",
'aring;' => "\xc3\xa5",
'asymp;' => "\xe2\x89\x88",
'atilde' => "\xc3\xa3",
'atilde;' => "\xc3\xa3",
'auml' => "\xc3\xa4",
'auml;' => "\xc3\xa4",
'bdquo;' => "\xe2\x80\x9e",
'beta;' => "\xce\xb2",
'brvbar' => "\xc2\xa6",
'brvbar;' => "\xc2\xa6",
'bull;' => "\xe2\x80\xa2",
'cap;' => "\xe2\x88\xa9",
'ccedil' => "\xc3\xa7",
'ccedil;' => "\xc3\xa7",
'cedil' => "\xc2\xb8",
'cedil;' => "\xc2\xb8",
'cent' => "\xc2\xa2",
'cent;' => "\xc2\xa2",
'chi;' => "\xcf\x87",
'circ;' => "\xcb\x86",
'clubs;' => "\xe2\x99\xa3",
'cong;' => "\xe2\x89\x85",
'copy' => "\xc2\xa9",
'copy;' => "\xc2\xa9",
'crarr;' => "\xe2\x86\xb5",
'cup;' => "\xe2\x88\xaa",
'curren' => "\xc2\xa4",
'curren;' => "\xc2\xa4",
'dArr;' => "\xe2\x87\x93",
'dagger;' => "\xe2\x80\xa0",
'darr;' => "\xe2\x86\x93",
'deg' => "\xc2\xb0",
'deg;' => "\xc2\xb0",
'delta;' => "\xce\xb4",
'diams;' => "\xe2\x99\xa6",
'divide' => "\xc3\xb7",
'divide;' => "\xc3\xb7",
'eacute' => "\xc3\xa9",
'eacute;' => "\xc3\xa9",
'ecirc' => "\xc3\xaa",
'ecirc;' => "\xc3\xaa",
'egrave' => "\xc3\xa8",
'egrave;' => "\xc3\xa8",
'empty;' => "\xe2\x88\x85",
'emsp;' => "\xe2\x80\x83",
'ensp;' => "\xe2\x80\x82",
'epsilon;' => "\xce\xb5",
'equiv;' => "\xe2\x89\xa1",
'eta;' => "\xce\xb7",
'eth' => "\xc3\xb0",
'eth;' => "\xc3\xb0",
'euml' => "\xc3\xab",
'euml;' => "\xc3\xab",
'euro;' => "\xe2\x82\xac",
'exist;' => "\xe2\x88\x83",
'fnof;' => "\xc6\x92",
'forall;' => "\xe2\x88\x80",
'frac12' => "\xc2\xbd",
'frac12;' => "\xc2\xbd",
'frac14' => "\xc2\xbc",
'frac14;' => "\xc2\xbc",
'frac34' => "\xc2\xbe",
'frac34;' => "\xc2\xbe",
'frasl;' => "\xe2\x81\x84",
'gamma;' => "\xce\xb3",
'ge;' => "\xe2\x89\xa5",
'gt' => '>',
'gt;' => '>',
'hArr;' => "\xe2\x87\x94",
'harr;' => "\xe2\x86\x94",
'hearts;' => "\xe2\x99\xa5",
'hellip;' => "\xe2\x80\xa6",
'iacute' => "\xc3\xad",
'iacute;' => "\xc3\xad",
'icirc' => "\xc3\xae",
'icirc;' => "\xc3\xae",
'iexcl' => "\xc2\xa1",
'iexcl;' => "\xc2\xa1",
'igrave' => "\xc3\xac",
'igrave;' => "\xc3\xac",
'image;' => "\xe2\x84\x91",
'infin;' => "\xe2\x88\x9e",
'int;' => "\xe2\x88\xab",
'iota;' => "\xce\xb9",
'iquest' => "\xc2\xbf",
'iquest;' => "\xc2\xbf",
'isin;' => "\xe2\x88\x88",
'iuml' => "\xc3\xaf",
'iuml;' => "\xc3\xaf",
'kappa;' => "\xce\xba",
'lArr;' => "\xe2\x87\x90",
'lambda;' => "\xce\xbb",
'lang;' => "\xe3\x80\x88",
'laquo' => "\xc2\xab",
'laquo;' => "\xc2\xab",
'larr;' => "\xe2\x86\x90",
'lceil;' => "\xe2\x8c\x88",
'ldquo;' => "\xe2\x80\x9c",
'le;' => "\xe2\x89\xa4",
'lfloor;' => "\xe2\x8c\x8a",
'lowast;' => "\xe2\x88\x97",
'loz;' => "\xe2\x97\x8a",
'lrm;' => "\xe2\x80\x8e",
'lsaquo;' => "\xe2\x80\xb9",
'lsquo;' => "\xe2\x80\x98",
'lt' => '<',
'lt;' => '<',
'macr' => "\xc2\xaf",
'macr;' => "\xc2\xaf",
'mdash;' => "\xe2\x80\x94",
'micro' => "\xc2\xb5",
'micro;' => "\xc2\xb5",
'middot' => "\xc2\xb7",
'middot;' => "\xc2\xb7",
'minus;' => "\xe2\x88\x92",
'mu;' => "\xce\xbc",
'nabla;' => "\xe2\x88\x87",
'nbsp' => "\xc2\xa0",
'nbsp;' => "\xc2\xa0",
'ndash;' => "\xe2\x80\x93",
'ne;' => "\xe2\x89\xa0",
'ni;' => "\xe2\x88\x8b",
'not' => "\xc2\xac",
'not;' => "\xc2\xac",
'notin;' => "\xe2\x88\x89",
'nsub;' => "\xe2\x8a\x84",
'ntilde' => "\xc3\xb1",
'ntilde;' => "\xc3\xb1",
'nu;' => "\xce\xbd",
'oacute' => "\xc3\xb3",
'oacute;' => "\xc3\xb3",
'ocirc' => "\xc3\xb4",
'ocirc;' => "\xc3\xb4",
'oelig;' => "\xc5\x93",
'ograve' => "\xc3\xb2",
'ograve;' => "\xc3\xb2",
'oline;' => "\xe2\x80\xbe",
'omega;' => "\xcf\x89",
'omicron;' => "\xce\xbf",
'oplus;' => "\xe2\x8a\x95",
'or;' => "\xe2\x88\xa8",
'ordf' => "\xc2\xaa",
'ordf;' => "\xc2\xaa",
'ordm' => "\xc2\xba",
'ordm;' => "\xc2\xba",
'oslash' => "\xc3\xb8",
'oslash;' => "\xc3\xb8",
'otilde' => "\xc3\xb5",
'otilde;' => "\xc3\xb5",
'otimes;' => "\xe2\x8a\x97",
'ouml' => "\xc3\xb6",
'ouml;' => "\xc3\xb6",
'para' => "\xc2\xb6",
'para;' => "\xc2\xb6",
'part;' => "\xe2\x88\x82",
'permil;' => "\xe2\x80\xb0",
'perp;' => "\xe2\x8a\xa5",
'phi;' => "\xcf\x86",
'pi;' => "\xcf\x80",
'piv;' => "\xcf\x96",
'plusmn' => "\xc2\xb1",
'plusmn;' => "\xc2\xb1",
'pound' => "\xc2\xa3",
'pound;' => "\xc2\xa3",
'prime;' => "\xe2\x80\xb2",
'prod;' => "\xe2\x88\x8f",
'prop;' => "\xe2\x88\x9d",
'psi;' => "\xcf\x88",
'quot' => '"',
'quot;' => '"',
'rArr;' => "\xe2\x87\x92",
'radic;' => "\xe2\x88\x9a",
'rang;' => "\xe3\x80\x89",
'raquo' => "\xc2\xbb",
'raquo;' => "\xc2\xbb",
'rarr;' => "\xe2\x86\x92",
'rceil;' => "\xe2\x8c\x89",
'rdquo;' => "\xe2\x80\x9d",
'real;' => "\xe2\x84\x9c",
'reg' => "\xc2\xae",
'reg;' => "\xc2\xae",
'rfloor;' => "\xe2\x8c\x8b",
'rho;' => "\xcf\x81",
'rlm;' => "\xe2\x80\x8f",
'rsaquo;' => "\xe2\x80\xba",
'rsquo;' => "\xe2\x80\x99",
'sbquo;' => "\xe2\x80\x9a",
'scaron;' => "\xc5\xa1",
'sdot;' => "\xe2\x8b\x85",
'sect' => "\xc2\xa7",
'sect;' => "\xc2\xa7",
'shy' => "\xc2\xad",
'shy;' => "\xc2\xad",
'sigma;' => "\xcf\x83",
'sigmaf;' => "\xcf\x82",
'sim;' => "\xe2\x88\xbc",
'spades;' => "\xe2\x99\xa0",
'sub;' => "\xe2\x8a\x82",
'sube;' => "\xe2\x8a\x86",
'sum;' => "\xe2\x88\x91",
'sup1' => "\xc2\xb9",
'sup1;' => "\xc2\xb9",
'sup2' => "\xc2\xb2",
'sup2;' => "\xc2\xb2",
'sup3' => "\xc2\xb3",
'sup3;' => "\xc2\xb3",
'sup;' => "\xe2\x8a\x83",
'supe;' => "\xe2\x8a\x87",
'szlig' => "\xc3\x9f",
'szlig;' => "\xc3\x9f",
'tau;' => "\xcf\x84",
'there4;' => "\xe2\x88\xb4",
'theta;' => "\xce\xb8",
'thetasym;' => "\xcf\x91",
'thinsp;' => "\xe2\x80\x89",
'thorn' => "\xc3\xbe",
'thorn;' => "\xc3\xbe",
'tilde;' => "\xcb\x9c",
'times' => "\xc3\x97",
'times;' => "\xc3\x97",
'trade;' => "\xe2\x84\xa2",
'uArr;' => "\xe2\x87\x91",
'uacute' => "\xc3\xba",
'uacute;' => "\xc3\xba",
'uarr;' => "\xe2\x86\x91",
'ucirc' => "\xc3\xbb",
'ucirc;' => "\xc3\xbb",
'ugrave' => "\xc3\xb9",
'ugrave;' => "\xc3\xb9",
'uml' => "\xc2\xa8",
'uml;' => "\xc2\xa8",
'upsih;' => "\xcf\x92",
'upsilon;' => "\xcf\x85",
'uuml' => "\xc3\xbc",
'uuml;' => "\xc3\xbc",
'weierp;' => "\xe2\x84\x98",
'xi;' => "\xce\xbe",
'yacute' => "\xc3\xbd",
'yacute;' => "\xc3\xbd",
'yen' => "\xc2\xa5",
'yen;' => "\xc2\xa5",
'yuml' => "\xc3\xbf",
'yuml;' => "\xc3\xbf",
'zeta;' => "\xce\xb6",
'zwj;' => "\xe2\x80\x8d",
'zwnj;' => "\xe2\x80\x8c"
}
ENCODINGS = %w[
ansi_x3.4-1968
iso-ir-6
ansi_x3.4-1986
iso_646.irv:1991
ascii
iso646-us
us-ascii
us
ibm367
cp367
csascii
ks_c_5601-1987
korean
iso-2022-kr
csiso2022kr
euc-kr
iso-2022-jp
csiso2022jp
iso-2022-jp-2
iso-ir-58
chinese
csiso58gb231280
iso_8859-1:1987
iso-ir-100
iso_8859-1
iso-8859-1
latin1
l1
ibm819
cp819
csisolatin1
iso_8859-2:1987
iso-ir-101
iso_8859-2
iso-8859-2
latin2
l2
csisolatin2
iso_8859-3:1988
iso-ir-109
iso_8859-3
iso-8859-3
latin3
l3
csisolatin3
iso_8859-4:1988
iso-ir-110
iso_8859-4
iso-8859-4
latin4
l4
csisolatin4
iso_8859-6:1987
iso-ir-127
iso_8859-6
iso-8859-6
ecma-114
asmo-708
arabic
csisolatinarabic
iso_8859-7:1987
iso-ir-126
iso_8859-7
iso-8859-7
elot_928
ecma-118
greek
greek8
csisolatingreek
iso_8859-8:1988
iso-ir-138
iso_8859-8
iso-8859-8
hebrew
csisolatinhebrew
iso_8859-5:1988
iso-ir-144
iso_8859-5
iso-8859-5
cyrillic
csisolatincyrillic
iso_8859-9:1989
iso-ir-148
iso_8859-9
iso-8859-9
latin5
l5
csisolatin5
iso-8859-10
iso-ir-157
l6
iso_8859-10:1992
csisolatin6
latin6
hp-roman8
roman8
r8
ibm037
cp037
csibm037
ibm424
cp424
csibm424
ibm437
cp437
437
cspc8codepage437
ibm500
cp500
csibm500
ibm775
cp775
cspc775baltic
ibm850
cp850
850
cspc850multilingual
ibm852
cp852
852
cspcp852
ibm855
cp855
855
csibm855
ibm857
cp857
857
csibm857
ibm860
cp860
860
csibm860
ibm861
cp861
861
cp-is
csibm861
ibm862
cp862
862
cspc862latinhebrew
ibm863
cp863
863
csibm863
ibm864
cp864
csibm864
ibm865
cp865
865
csibm865
ibm866
cp866
866
csibm866
ibm869
cp869
869
cp-gr
csibm869
ibm1026
cp1026
csibm1026
koi8-r
cskoi8r
koi8-u
big5-hkscs
ptcp154
csptcp154
pt154
cp154
utf-7
utf-16be
utf-16le
utf-16
utf-8
iso-8859-13
iso-8859-14
iso-ir-199
iso_8859-14:1998
iso_8859-14
latin8
iso-celtic
l8
iso-8859-15
iso_8859-15
iso-8859-16
iso-ir-226
iso_8859-16:2001
iso_8859-16
latin10
l10
gbk
cp936
ms936
gb18030
shift_jis
ms_kanji
csshiftjis
euc-jp
gb2312
big5
csbig5
windows-1250
windows-1251
windows-1252
windows-1253
windows-1254
windows-1255
windows-1256
windows-1257
windows-1258
tis-620
hz-gb-2312
]
end

View file

@ -0,0 +1 @@
require 'html5/filters/optionaltags'

View file

@ -1,7 +1,7 @@
require 'delegate'
require 'enumerator'
module HTML5lib
module HTML5
module Filters
class Base < SimpleDelegator
include Enumerable

View file

@ -1,6 +1,6 @@
require 'html5lib/filters/base'
require 'html5/filters/base'
module HTML5lib
module HTML5
module Filters
class InjectMetaCharset < Base
def initialize(source, encoding)

View file

@ -1,7 +1,7 @@
require 'html5lib/constants'
require 'html5lib/filters/base'
require 'html5/constants'
require 'html5/filters/base'
module HTML5lib
module HTML5
module Filters
class OptionalTagFilter < Base

View file

@ -1,7 +1,7 @@
require 'html5lib/filters/base'
require 'html5lib/sanitizer'
require 'html5/filters/base'
require 'html5/sanitizer'
module HTML5lib
module HTML5
module Filters
class HTMLSanitizeFilter < Base
include HTMLSanitizeModule

View file

@ -1,7 +1,7 @@
require 'html5lib/constants'
require 'html5lib/filters/base'
require 'html5/constants'
require 'html5/filters/base'
module HTML5lib
module HTML5
module Filters
class WhitespaceFilter < Base

View file

@ -1,246 +1,246 @@
require 'html5lib/constants'
require 'html5lib/tokenizer'
require 'html5lib/treebuilders/rexml'
Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
require 'html5lib/html5parser/' + File.basename(path)
end
module HTML5lib
# Error in parsed document
class ParseError < Exception; end
class AssertionError < Exception; end
# HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
#
class HTMLParser
attr_accessor :phase, :firstStartTag, :innerHTML, :lastPhase, :insertFromTable
attr_reader :phases, :tokenizer, :tree, :errors
# Build a parser from +options+ and parse +stream+ as a complete document.
#
# The :encoding option is split off and passed to #parse; every other
# option goes to HTMLParser.new. The split is done on a copy, so the
# caller's hash is left intact and can be reused for another call.
def self.parse(stream, options = {})
  options = options.dup
  encoding = options.delete(:encoding)
  new(options).parse(stream, encoding)
end
# Build a parser from +options+ and parse +stream+ as a fragment.
#
# :container (default 'div') and :encoding are split off and passed to
# #parseFragment; every other option goes to HTMLParser.new. The split is
# done on a copy, so the caller's hash is left intact.
def self.parseFragment(stream, options = {})
  options = options.dup
  container = options.delete(:container) || 'div'
  encoding = options.delete(:encoding)
  new(options).parseFragment(stream, container, encoding)
end
@@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption
inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )
# :strict - raise an exception when a parse error is encountered
# :tree - a treebuilder class controlling the type of tree that will be
# returned. Built in treebuilders can be accessed through
# HTML5lib::TreeBuilders[treeType]
def initialize(options = {})
@strict = false
@errors = []
@tokenizer = HTMLTokenizer
@tree = TreeBuilders::REXML::TreeBuilder
options.each { |name, value| instance_variable_set("@#{name}", value) }
@tree = @tree.new
@phases = @@phases.inject({}) do |phases, phase_name|
phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
phases[phase_name.to_sym] = HTML5lib.const_get(phase_class_name).new(self, @tree)
phases
end
end
def _parse(stream, innerHTML, encoding, container = 'div')
@tree.reset
@firstStartTag = false
@errors = []
@tokenizer = @tokenizer.class unless Class === @tokenizer
@tokenizer = @tokenizer.new(stream, :encoding => encoding,
:parseMeta => !innerHTML)
if innerHTML
case @innerHTML = container.downcase
when 'title', 'textarea'
@tokenizer.contentModelFlag = :RCDATA
when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
@tokenizer.contentModelFlag = :CDATA
when 'plaintext'
@tokenizer.contentModelFlag = :PLAINTEXT
else
# contentModelFlag already is PCDATA
#@tokenizer.contentModelFlag = :PCDATA
end
@phase = @phases[:rootElement]
@phase.insertHtmlElement
resetInsertionMode
else
@innerHTML = false
@phase = @phases[:initial]
end
# We only seem to have InBodyPhase testcases where the following is
# relevant ... need others too
@lastPhase = nil
# XXX This is temporary for the moment so there isn't any other
# changes needed for the parser to work with the iterable tokenizer
@tokenizer.each do |token|
token = normalizeToken(token)
method = 'process%s' % token[:type]
case token[:type]
when :Characters, :SpaceCharacters, :Comment
@phase.send method, token[:data]
when :StartTag
@phase.send method, token[:name], token[:data]
when :EndTag
@phase.send method, token[:name]
when :Doctype
@phase.send method, token[:name], token[:publicId],
token[:systemId], token[:correct]
else
parseError(token[:data])
end
end
# When the loop finishes it's EOF
@phase.processEOF
end
# Parse a HTML document into a well-formed tree
#
# stream - a filelike object or string containing the HTML to be parsed
#
# The optional encoding parameter must be a string that indicates
# the encoding. If specified, that encoding will be used,
# regardless of any BOM or later declaration (such as in a meta
# element)
def parse(stream, encoding=nil)
_parse(stream, false, encoding)
return @tree.getDocument
end
# Parse a HTML fragment into a well-formed tree fragment
# container - name of the element we're setting the innerHTML property
# if set to nil, default to 'div'
#
# stream - a filelike object or string containing the HTML to be parsed
#
# The optional encoding parameter must be a string that indicates
# the encoding. If specified, that encoding will be used,
# regardless of any BOM or later declaration (such as in a meta
# element)
def parseFragment(stream, container='div', encoding=nil)
  # Honour the documented contract: an explicit nil container means 'div'.
  # Without the fallback, _parse would call nil.downcase and raise
  # NoMethodError.
  _parse(stream, true, encoding, container || 'div')
  return @tree.getFragment
end
# Record a parse error and, in strict mode, raise it.
#
# data - the error message; stored in @errors together with the tokenizer's
#        current stream position.
#
# Raises ParseError carrying +data+ as its message when @strict is set, so
# strict-mode callers can see what went wrong.
def parseError(data = 'XXX ERROR MESSAGE NEEDED')
  # XXX The idea is to make data mandatory.
  @errors.push([@tokenizer.stream.position, data])
  raise ParseError, data if @strict
end
# HTML5 specific normalizations to the token stream
# token - Hash with :type, plus :name and :data for tag tokens.
#
# * :EmptyTag tokens become :StartTag tokens; a parse error is recorded
#   when the element is not listed in VOID_ELEMENTS.
# * :StartTag names and attribute names are lower-cased (ASCII letters
#   only) and a non-empty attribute list is collapsed into a Hash.
# * :EndTag names are lower-cased the same way; attributes on an end tag
#   are reported as a parse error.
#
# Returns the same token object, modified in place.
def normalizeToken(token)
  if token[:type] == :EmptyTag
    # A trailing solidus (/) is only acceptable on void elements; on any
    # other element it is a parse error. Either way the token is handled
    # as an ordinary start tag from here on.
    unless VOID_ELEMENTS.include?(token[:name])
      parseError(_('Solidus (/) incorrectly placed in tag.'))
    end
    token[:type] = :StartTag
  end

  if token[:type] == :StartTag
    token[:name] = token[:name].tr(ASCII_UPPERCASE,ASCII_LOWERCASE)

    # We need to remove the duplicate attributes and convert attributes
    # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}.
    # The list is reversed so the earliest pair is written last and wins.
    unless token[:data].empty?
      data = token[:data].reverse.map { |attr, value| [attr.tr(ASCII_UPPERCASE, ASCII_LOWERCASE), value] }
      token[:data] = Hash[*data.flatten]
    end

  elsif token[:type] == :EndTag
    parseError(_('End tag contains unexpected attributes.')) unless token[:data].empty?
    # Lower-case ASCII letters only, exactly as for start tags.
    # String#downcase also folds non-ASCII letters on current Rubies, which
    # would make an end tag normalise differently from its start tag.
    token[:name] = token[:name].tr(ASCII_UPPERCASE, ASCII_LOWERCASE)
  end

  return token
end
@@new_modes = {
'select' => :inSelect,
'td' => :inCell,
'th' => :inCell,
'tr' => :inRow,
'tbody' => :inTableBody,
'thead' => :inTableBody,
'tfoot' => :inTableBody,
'caption' => :inCaption,
'colgroup' => :inColumnGroup,
'table' => :inTable,
'head' => :inBody,
'body' => :inBody,
'frameset' => :inFrameset
}
def resetInsertionMode
# The name of this method is mostly historical. (It's also used in the
# specification.)
last = false
@tree.openElements.reverse.each do |node|
nodeName = node.name
if node == @tree.openElements[0]
last = true
unless ['td', 'th'].include?(nodeName)
# XXX
# assert @innerHTML
nodeName = @innerHTML
end
end
# Check for conditions that should only happen in the innerHTML
# case
if ['select', 'colgroup', 'head', 'frameset'].include?(nodeName)
# XXX
# assert @innerHTML
end
if @@new_modes.has_key?(nodeName)
@phase = @phases[@@new_modes[nodeName]]
elsif nodeName == 'html'
@phase = @phases[@tree.headPointer.nil?? :beforeHead : :afterHead]
elsif last
@phase = @phases[:inBody]
else
next
end
break
end
end
def _(string); string; end
end
end
require 'html5/constants'
require 'html5/tokenizer'
require 'html5/treebuilders/rexml'
Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
require 'html5/html5parser/' + File.basename(path)
end
module HTML5
# Error in parsed document
# NOTE: derives from Exception, not StandardError, so a bare `rescue`
# will not catch it; callers have to name ParseError explicitly.
class ParseError < Exception; end
# NOTE(review): also derives from Exception. The only asserts visible in
# this file are commented out, so it may currently be unused - confirm.
class AssertionError < Exception; end
# HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
#
class HTMLParser
attr_accessor :phase, :firstStartTag, :innerHTML, :lastPhase, :insertFromTable
attr_reader :phases, :tokenizer, :tree, :errors
# Build a parser from +options+ and parse +stream+ as a complete document.
#
# The :encoding option is split off and passed to #parse; every other
# option goes to HTMLParser.new. The split is done on a copy, so the
# caller's hash is left intact and can be reused for another call.
def self.parse(stream, options = {})
  options = options.dup
  encoding = options.delete(:encoding)
  new(options).parse(stream, encoding)
end
# Build a parser from +options+ and parse +stream+ as a fragment.
#
# :container (default 'div') and :encoding are split off and passed to
# #parseFragment; every other option goes to HTMLParser.new. The split is
# done on a copy, so the caller's hash is left intact.
def self.parseFragment(stream, options = {})
  options = options.dup
  container = options.delete(:container) || 'div'
  encoding = options.delete(:encoding)
  new(options).parseFragment(stream, container, encoding)
end
# Names of the parser phases. The constructor upper-cases the first letter
# of each name, appends 'Phase' to find the class (e.g. 'inBody' ->
# InBodyPhase), and stores one instance per name under the symbol key.
@@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption
inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )
# :strict - raise an exception when a parse error is encountered
# :tree - a treebuilder class controlling the type of tree that will be
# returned. Built in treebuilders can be accessed through
# HTML5::TreeBuilders[treeType]
def initialize(options = {})
  # Defaults; each can be overridden through +options+ below.
  @strict = false
  @errors = []
  @tokenizer = HTMLTokenizer
  @tree = TreeBuilders::REXML::TreeBuilder

  # Every option key becomes an instance variable of the same name.
  options.each do |name, value|
    instance_variable_set("@#{name}", value)
  end

  # Up to here @tree holds a treebuilder class; swap in an instance of it.
  @tree = @tree.new

  # One phase object per entry in @@phases, keyed by the name as a symbol.
  # @phases is only assigned once the whole table has been built.
  table = {}
  @@phases.each do |phase_name|
    class_name = phase_name.sub(/\A./) { |first| first.upcase } + 'Phase'
    table[phase_name.to_sym] = HTML5.const_get(class_name).new(self, @tree)
  end
  @phases = table
end
# Shared driver behind #parse and #parseFragment: resets parser state, builds
# a tokenizer over +stream+ and feeds every token to the current phase.
#
# stream    - input handed to the tokenizer
# innerHTML - true for fragment parsing, false for a whole document
# encoding  - passed to the tokenizer as :encoding
# container - fragment case only: name of the context element
#
# Returns the result of the final @phase.processEOF call; the visible
# callers ignore it and read the result from @tree instead.
def _parse(stream, innerHTML, encoding, container = 'div')
@tree.reset
@firstStartTag = false
@errors = []
# After a previous run @tokenizer holds an instance, not a class; go back to
# its class so a fresh tokenizer can be built for this stream.
@tokenizer = @tokenizer.class unless Class === @tokenizer
# :parseMeta is switched off for fragments.
@tokenizer = @tokenizer.new(stream, :encoding => encoding,
:parseMeta => !innerHTML)
if innerHTML
# Fragment case: the (lower-cased) container name is remembered in
# @innerHTML and picks the tokenizer's starting content model.
case @innerHTML = container.downcase
when 'title', 'textarea'
@tokenizer.contentModelFlag = :RCDATA
when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
@tokenizer.contentModelFlag = :CDATA
when 'plaintext'
@tokenizer.contentModelFlag = :PLAINTEXT
else
# contentModelFlag already is PCDATA
#@tokenizer.contentModelFlag = :PCDATA
end
# Insert the html element via the rootElement phase, then let
# resetInsertionMode choose the phase to continue in.
@phase = @phases[:rootElement]
@phase.insertHtmlElement
resetInsertionMode
else
# Document case: start in the initial phase.
@innerHTML = false
@phase = @phases[:initial]
end
# We only seem to have InBodyPhase testcases where the following is
# relevant ... need others too
@lastPhase = nil
# XXX This is temporary for the moment so there isn't any other
# changes needed for the parser to work with the iterable tokenizer
@tokenizer.each do |token|
token = normalizeToken(token)
# Handler name is derived from the token type, e.g. :StartTag ->
# 'processStartTag'. @phase is read afresh for every token.
method = 'process%s' % token[:type]
case token[:type]
when :Characters, :SpaceCharacters, :Comment
@phase.send method, token[:data]
when :StartTag
@phase.send method, token[:name], token[:data]
when :EndTag
@phase.send method, token[:name]
when :Doctype
@phase.send method, token[:name], token[:publicId],
token[:systemId], token[:correct]
else
# Any other token type is recorded as a parse error, with the token's
# data as the message.
parseError(token[:data])
end
end
# When the loop finishes it's EOF
@phase.processEOF
end
# Parse a HTML document into a well-formed tree
#
# stream - a filelike object or string containing the HTML to be parsed
#
# The optional encoding parameter must be a string that indicates
# the encoding. If specified, that encoding will be used,
# regardless of any BOM or later declaration (such as in a meta
# element)
def parse(stream, encoding=nil)
  # Run the shared driver in whole-document mode (innerHTML = false); the
  # finished document is then read back from the tree builder.
  _parse(stream, false, encoding)
  @tree.getDocument
end
# Parse a HTML fragment into a well-formed tree fragment
# container - name of the element we're setting the innerHTML property
# if set to nil, default to 'div'
#
# stream - a filelike object or string containing the HTML to be parsed
#
# The optional encoding parameter must be a string that indicates
# the encoding. If specified, that encoding will be used,
# regardless of any BOM or later declaration (such as in a meta
# element)
def parseFragment(stream, container='div', encoding=nil)
  # Honour the documented contract: an explicit nil container means 'div'.
  # Without the fallback, _parse would call nil.downcase and raise
  # NoMethodError.
  _parse(stream, true, encoding, container || 'div')
  return @tree.getFragment
end
# Record a parse error and, in strict mode, raise it.
#
# data - the error message; stored in @errors together with the tokenizer's
#        current stream position.
#
# Raises ParseError carrying +data+ as its message when @strict is set, so
# strict-mode callers can see what went wrong.
def parseError(data = 'XXX ERROR MESSAGE NEEDED')
  # XXX The idea is to make data mandatory.
  @errors.push([@tokenizer.stream.position, data])
  raise ParseError, data if @strict
end
# HTML5 specific normalizations to the token stream
# token - Hash with :type, plus :name and :data for tag tokens.
#
# * :EmptyTag tokens become :StartTag tokens; a parse error is recorded
#   when the element is not listed in VOID_ELEMENTS.
# * :StartTag names and attribute names are lower-cased (ASCII letters
#   only) and a non-empty attribute list is collapsed into a Hash.
# * :EndTag names are lower-cased the same way; attributes on an end tag
#   are reported as a parse error.
#
# Returns the same token object, modified in place.
def normalizeToken(token)
  if token[:type] == :EmptyTag
    # A trailing solidus (/) is only acceptable on void elements; on any
    # other element it is a parse error. Either way the token is handled
    # as an ordinary start tag from here on.
    unless VOID_ELEMENTS.include?(token[:name])
      parseError(_('Solidus (/) incorrectly placed in tag.'))
    end
    token[:type] = :StartTag
  end

  if token[:type] == :StartTag
    token[:name] = token[:name].tr(ASCII_UPPERCASE,ASCII_LOWERCASE)

    # We need to remove the duplicate attributes and convert attributes
    # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}.
    # The list is reversed so the earliest pair is written last and wins.
    unless token[:data].empty?
      data = token[:data].reverse.map { |attr, value| [attr.tr(ASCII_UPPERCASE, ASCII_LOWERCASE), value] }
      token[:data] = Hash[*data.flatten]
    end

  elsif token[:type] == :EndTag
    parseError(_('End tag contains unexpected attributes.')) unless token[:data].empty?
    # Lower-case ASCII letters only, exactly as for start tags.
    # String#downcase also folds non-ASCII letters on current Rubies, which
    # would make an end tag normalise differently from its start tag.
    token[:name] = token[:name].tr(ASCII_UPPERCASE, ASCII_LOWERCASE)
  end

  return token
end
# Element name => key into @phases, consulted by resetInsertionMode to pick
# the phase for the first matching open element. Note that 'head' maps to
# :inBody, the same phase as 'body'.
@@new_modes = {
'select' => :inSelect,
'td' => :inCell,
'th' => :inCell,
'tr' => :inRow,
'tbody' => :inTableBody,
'thead' => :inTableBody,
'tfoot' => :inTableBody,
'caption' => :inCaption,
'colgroup' => :inColumnGroup,
'table' => :inTable,
'head' => :inBody,
'body' => :inBody,
'frameset' => :inFrameset
}
# Choose @phase from the stack of open elements.
#
# Walks @tree.openElements from the most recently opened element back to the
# first one and stops at the first element that determines a phase. Sets
# @phase as a side effect; the return value is not meaningful.
def resetInsertionMode
# The name of this method is mostly historical. (It's also used in the
# specification.)
last = false
@tree.openElements.reverse.each do |node|
nodeName = node.name
if node == @tree.openElements[0]
# Reached the bottom of the stack. Unless it is a td/th, the name used
# for the lookup below is replaced by @innerHTML (the fragment's
# container name, or false when parsing a whole document).
last = true
unless ['td', 'th'].include?(nodeName)
# XXX
# assert @innerHTML
nodeName = @innerHTML
end
end
# Check for conditions that should only happen in the innerHTML
# case
if ['select', 'colgroup', 'head', 'frameset'].include?(nodeName)
# Intentionally empty: only the disabled assert below lived here.
# XXX
# assert @innerHTML
end
if @@new_modes.has_key?(nodeName)
@phase = @phases[@@new_modes[nodeName]]
elsif nodeName == 'html'
# No head element seen yet -> beforeHead, otherwise afterHead.
@phase = @phases[@tree.headPointer.nil?? :beforeHead : :afterHead]
elsif last
# Nothing matched all the way down the stack: fall back to inBody.
@phase = @phases[:inBody]
else
# This element decides nothing; look at the next one down the stack.
next
end
# A phase was chosen above; stop walking.
break
end
end
# Identity function wrapped around parse-error message strings.
# NOTE(review): presumably a placeholder hook for message translation - confirm.
def _(string)
  string
end
end
end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class AfterBodyPhase < Phase
handle_end 'html'

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class AfterFramesetPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#after3

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class AfterHeadPhase < Phase
handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'

View file

@ -1,11 +1,11 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class BeforeHeadPhase < Phase
handle_start 'html', 'head'
handle_end %w( html head body br ) => 'ImplyHead'
handle_end %w( html head body br p ) => 'ImplyHead'
def processEOF
startTagHead('head', {})

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InBodyPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
@ -112,7 +112,7 @@ module HTML5lib
def startTagForm(name, attributes)
if @tree.formPointer
@parser.parseError('Unexpected start tag (form). Ignored.')
@parser.parseError(_('Unexpected start tag (form). Ignored.'))
else
endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes)
@ -129,9 +129,9 @@ module HTML5lib
if stopName.include?(node.name)
poppedNodes = (0..i).collect { @tree.openElements.pop }
if i >= 1
@parser.parseError("Missing end tag%s (%s)" % [
@parser.parseError(_("Missing end tag%s (%s)" % [
(i>1 ? 's' : ''),
poppedNodes.reverse.map {|item| item.name}.join(', ')])
poppedNodes.reverse.map {|item| item.name}.join(', ')]))
end
break
end
@ -251,7 +251,7 @@ module HTML5lib
end
def startTagIsindex(name, attributes)
@parser.parseError("Unexpected start tag isindex. Don't use it!")
@parser.parseError(_("Unexpected start tag isindex. Don't use it!"))
return if @tree.formPointer
processStartTag('form', {})
processStartTag('hr', {})
@ -311,8 +311,13 @@ module HTML5lib
def endTagP(name)
@tree.generateImpliedEndTags('p') if in_scope?('p')
@parser.parseError('Unexpected end tag (p).') unless @tree.openElements[-1].name == 'p'
@tree.openElements.pop while in_scope?('p')
@parser.parseError(_('Unexpected end tag (p).')) unless @tree.openElements[-1].name == 'p'
if in_scope?('p')
@tree.openElements.pop while in_scope?('p')
else
startTagCloseP('p', {})
endTagP('p')
end
end
def endTagBody(name)
@ -342,7 +347,7 @@ module HTML5lib
@tree.generateImpliedEndTags if in_scope?(name)
unless @tree.openElements[-1].name == name
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
@parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
end
if in_scope?(name)
@ -351,7 +356,14 @@ module HTML5lib
end
def endTagForm(name)
endTagBlock(name)
if in_scope?(name)
@tree.generateImpliedEndTags
end
if @tree.openElements[-1].name != name
@parser.parseError(_("End tag (form) seen too early. Ignored."))
else
@tree.openElements.pop
end
@tree.formPointer = nil
end
@ -361,7 +373,7 @@ module HTML5lib
@tree.generateImpliedEndTags(name)
unless @tree.openElements[-1].name == name
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
@parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
end
end
@ -377,7 +389,7 @@ module HTML5lib
end
unless @tree.openElements[-1].name == name
@parser.parseError(("Unexpected end tag (#{name}). Expected other end tag."))
@parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag."))
end
HEADING_ELEMENTS.each do |element|

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InCaptionPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InCellPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InColumnGroupPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-column

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InFramesetPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset

View file

@ -1,12 +1,12 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InHeadPhase < Phase
handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta )
handle_end 'head'
handle_end %w( html body br ) => 'ImplyAfterHead'
handle_end %w( html body br p ) => 'ImplyAfterHead'
handle_end %w( title style script )
def processEOF

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InRowPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-row

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InSelectPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-select

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InTableBodyPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InTablePhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-table

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class InitialPhase < Phase
# This phase deals with error handling as well which is currently not

View file

@ -1,4 +1,4 @@
module HTML5lib
module HTML5
# Base class for helper objects that implement each phase of processing.
#
# Handler methods should be in the following order (they can be omitted):

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class RootElementPhase < Phase
def processEOF

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase'
require 'html5/html5parser/phase'
module HTML5lib
module HTML5
class TrailingEndPhase < Phase
def processEOF

View file

@ -1,7 +1,7 @@
require 'stringio'
require 'html5lib/constants'
require 'html5/constants'
module HTML5lib
module HTML5
# Provides a unicode stream of characters to the HTMLTokenizer.
@ -10,7 +10,7 @@ module HTML5lib
class HTMLInputStream
attr_accessor :queue, :char_encoding
attr_accessor :queue, :char_encoding, :errors
# Initialises the HTMLInputStream.
#
@ -40,25 +40,31 @@ module HTML5lib
#Number of bytes to use when looking for a meta element with
#encoding information
@NUM_BYTES_META = 512
#Number of bytes to use when detecting encoding using chardet
@NUM_BYTES_CHARDET = 256
#Number of bytes to use when reading content
@NUM_BYTES_BUFFER = 1024
#Encoding to use if no other information can be found
@DEFAULT_ENCODING = 'windows-1252'
#Detect encoding iff no explicit "transport level" encoding is supplied
if @encoding.nil? or not HTML5lib.is_valid_encoding(@encoding)
if @encoding.nil? or not HTML5.is_valid_encoding(@encoding)
@char_encoding = detect_encoding
else
@char_encoding = @encoding
end
# Read bytes from stream decoding them into Unicode
uString = @raw_stream.read
@buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
if @char_encoding == 'windows-1252'
@win1252 = true
elsif @char_encoding != 'utf-8'
begin
require 'iconv'
begin
uString = Iconv.iconv('utf-8', @char_encoding, uString).first
@buffer << @raw_stream.read unless @raw_stream.eof?
@buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
rescue
@win1252 = true
end
@ -67,10 +73,8 @@ module HTML5lib
end
end
# Convert the unicode string into a list to be used as the data stream
@data_stream = uString
@queue = []
@errors = []
# Reset position in the list to read from
@tell = 0
@ -109,9 +113,22 @@ module HTML5lib
begin
require 'rubygems'
require 'UniversalDetector' # gem install chardet
buffer = @raw_stream.read
encoding = UniversalDetector::chardet(buffer)['encoding']
seek(buffer, 0)
buffers = []
detector = UniversalDetector::Detector.instance
detector.reset
until @raw_stream.eof?
buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
break if !buffer or buffer.empty?
buffers << buffer
detector.feed(buffer)
break if detector.instance_eval {@done}
detector.instance_eval {
@_mLastChar = @_mLastChar.chr if Fixnum === @_mLastChar
}
end
detector.close
encoding = detector.result['encoding']
seek(buffers*'', 0)
rescue LoadError
end
end
@ -242,14 +259,20 @@ module HTML5lib
unless @queue.empty?
return @queue.shift
else
c = @data_stream[@tell]
if @tell + 3 > @buffer.length and !@raw_stream.eof?
# read next block
@buffer = @buffer[@tell .. -1] + @raw_stream.read(@NUM_BYTES_BUFFER)
@tell = 0
end
c = @buffer[@tell]
@tell += 1
case c
when 0x01 .. 0x7F
if c == 0x0D
# normalize newlines
@tell += 1 if @data_stream[@tell] == 0x0A
@tell += 1 if @buffer[@tell] == 0x0A
c = 0x0A
end
@ -276,7 +299,7 @@ module HTML5lib
when 0xC0 .. 0xFF
if @win1252
"\xC3" + (c-64).chr # convert to utf-8
elsif @data_stream[@tell-1 .. -1] =~ /^
elsif @buffer[@tell-1 .. @tell+3] =~ /^
( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
@ -292,6 +315,8 @@ module HTML5lib
end
when 0x00
@errors.push('null character found in input stream, ' +
'replaced with U+FFFD')
[0xFFFD].pack('U') # null characters are invalid
else
@ -317,6 +342,10 @@ module HTML5lib
@queue.insert(0, c) unless c == :EOF
return char_stack.join('')
end
# Pushes previously consumed characters back onto the front of @queue.
#
# characters:: a single character (String), an Array of characters, or :EOF.
#              :EOF is ignored and leaves the queue untouched.
#
# Returns nil for :EOF, otherwise the queue.
#
# Kernel#Array performs the conversion instead of #to_a because String does
# not respond to #to_a on Ruby 1.9 and later; for a single character both
# forms produce a one-element array.
def unget(characters)
  return if characters == :EOF
  @queue.unshift(*Array(characters))
end
end
# String-like object with an associated position and various extra methods
@ -433,14 +462,14 @@ module HTML5lib
if attr[0] == 'charset'
tentative_encoding = attr[1]
if HTML5lib.is_valid_encoding(tentative_encoding)
if HTML5.is_valid_encoding(tentative_encoding)
@encoding = tentative_encoding
return false
end
elsif attr[0] == 'content'
content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
tentative_encoding = content_parser.parse
if HTML5lib.is_valid_encoding(tentative_encoding)
if HTML5.is_valid_encoding(tentative_encoding)
@encoding = tentative_encoding
return false
end

View file

@ -11,10 +11,10 @@
#
# @@TODO:
# * Selectively lowercase only XHTML, but not foreign markup
require 'html5lib/html5parser'
require 'html5lib/constants'
require 'html5/html5parser'
require 'html5/constants'
module HTML5lib
module HTML5
# liberal XML parser
class XMLParser < HTMLParser
@ -25,25 +25,35 @@ module HTML5lib
end
def normalizeToken(token)
if token[:type] == :StartTag or token[:type] == :EmptyTag
case token[:type]
when :StartTag, :EmptyTag
# We need to remove the duplicate attributes and convert attributes
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
# to a Hash so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
token[:data] = Hash[*token[:data].reverse.flatten]
# For EmptyTags, process both a Start and an End tag
if token[:type] == :EmptyTag
save = @tokenizer.contentModelFlag
@phase.processStartTag(token[:name], token[:data])
@tokenizer.contentModelFlag = save
token[:data] = {}
token[:type] = :EndTag
end
elsif token[:type] == :EndTag
when :Characters
# un-escape RCDATA_ELEMENTS (e.g. style, script)
if @tokenizer.contentModelFlag == :CDATA
token[:data] = token[:data].
gsub('&lt;','<').gsub('&gt;','>').gsub('&amp;','&')
end
when :EndTag
if token[:data]
parseError(_("End tag contains unexpected attributes."))
end
elsif token[:type] == :Comment
when :Comment
# Rescue CDATA from the comments
if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
token[:type] = :Characters

View file

@ -1,6 +1,7 @@
require 'cgi'
require 'html5/tokenizer'
module HTML5lib
module HTML5
# This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes.

View file

@ -0,0 +1,2 @@
require 'html5/serializer/htmlserializer'
require 'html5/serializer/xhtmlserializer'

View file

@ -1,6 +1,6 @@
require 'html5lib/constants'
require 'html5/constants'
module HTML5lib
module HTML5
class HTMLSerializer
@ -21,6 +21,7 @@ module HTML5lib
@use_trailing_solidus = false
@space_before_trailing_solidus = true
@escape_lt_in_attrs = false
@escape_rcdata = false
@omit_optional_tags = true
@sanitize = false
@ -43,22 +44,22 @@ module HTML5lib
@errors = []
if encoding and @inject_meta_charset
require 'html5lib/filters/inject_meta_charset'
require 'html5/filters/inject_meta_charset'
treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
end
if @strip_whitespace
require 'html5lib/filters/whitespace'
require 'html5/filters/whitespace'
treewalker = Filters::WhitespaceFilter.new(treewalker)
end
if @sanitize
require 'html5lib/filters/sanitizer'
require 'html5/filters/sanitizer'
treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
end
if @omit_optional_tags
require 'html5lib/filters/optionaltags'
require 'html5/filters/optionaltags'
treewalker = Filters::OptionalTagFilter.new(treewalker)
end
@ -81,7 +82,7 @@ module HTML5lib
elsif [:StartTag, :EmptyTag].include? type
name = token[:name]
if RCDATA_ELEMENTS.include?(name)
if RCDATA_ELEMENTS.include?(name) and not @escape_rcdata
in_cdata = true
elsif in_cdata
serializeError(_("Unexpected child element of a CDATA element"))

View file

@ -1,6 +1,6 @@
require 'html5lib/serializer/htmlserializer'
require 'html5/serializer/htmlserializer'
module HTML5lib
module HTML5
class XHTMLSerializer < HTMLSerializer
DEFAULTS = {
@ -8,7 +8,8 @@ module HTML5lib
:minimize_boolean_attributes => false,
:use_trailing_solidus => true,
:escape_lt_in_attrs => true,
:omit_optional_tags => false
:omit_optional_tags => false,
:escape_rcdata => true
}
def initialize(options={})

View file

@ -1,7 +1,7 @@
require 'html5lib/constants'
require 'html5lib/inputstream'
require 'html5/constants'
require 'html5/inputstream'
module HTML5lib
module HTML5
# This class takes care of tokenizing HTML.
#
@ -84,9 +84,9 @@ module HTML5lib
# Start processing. When EOF is reached @state will return false
# instead of true and the loop will terminate.
while send @state
while not @tokenQueue.empty?
yield @tokenQueue.shift
end
yield :type => :ParseError, :data => @stream.errors.shift until
@stream.errors.empty?
yield @tokenQueue.shift until @tokenQueue.empty?
end
end
@ -109,7 +109,7 @@ module HTML5lib
# The character we just consumed needs to be put back on the stack so it
# doesn't get lost...
@stream.queue.push(data)
@stream.unget(data)
end
# This function returns either U+FFFD or the character based on the
@ -128,7 +128,6 @@ module HTML5lib
radix = 16
end
char = [0xFFFD].pack('U')
charStack = []
# Consume all the characters that are in range while making sure we
@ -142,17 +141,25 @@ module HTML5lib
# Convert the set of characters consumed to an int.
charAsInt = charStack.join('').to_i(radix)
# If the integer is between 127 and 160 (so 128 and bigger and 159 and
# smaller) we need to do the "windows trick".
if (127...160).include? charAsInt
if charAsInt == 13
@tokenQueue.push({:type => :ParseError, :data =>
_("Incorrect CR newline entity. Replaced with LF.")})
charAsInt = 10
elsif (128..159).include? charAsInt
# If the integer is between 127 and 160 (so 128 and bigger and 159
# and smaller) we need to do the "windows trick".
@tokenQueue.push({:type => :ParseError, :data =>
_("Entity used with illegal number (windows-1252 reference).")})
charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
end
if charAsInt > 0 and charAsInt <= 1114111
if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343)
char = [charAsInt].pack('U')
else
char = [0xFFFD].pack('U')
@tokenQueue.push({:type => :ParseError, :data =>
_("Numeric entity represents an illegal codepoint.")})
end
# Discard the ; if present. Otherwise, put it back on the queue and
@ -160,18 +167,18 @@ module HTML5lib
if c != ";"
@tokenQueue.push({:type => :ParseError, :data =>
_("Numeric entity didn't end with ';'.")})
@stream.queue.push(c)
@stream.unget(c)
end
return char
end
def consumeEntity
def consumeEntity(from_attribute=false)
char = nil
charStack = [@stream.char]
if SPACE_CHARACTERS.include?(charStack[0]) or
[:EOF, '<', '&'].include?(charStack[0])
@stream.queue+= charStack
@stream.unget(charStack)
elsif charStack[0] == "#"
# We might have a number entity here.
charStack += [@stream.char, @stream.char]
@ -179,22 +186,22 @@ module HTML5lib
# If we reach the end of the file put everything up to :EOF
# back in the queue
charStack = charStack[0...charStack.index(:EOF)]
@stream.queue+= charStack
@stream.unget(charStack)
@tokenQueue.push({:type => :ParseError, :data =>
_("Numeric entity expected. Got end of file instead.")})
else
if charStack[1].downcase == "x" \
and HEX_DIGITS.include? charStack[2]
# Hexadecimal entity detected.
@stream.queue.push(charStack[2])
@stream.unget(charStack[2])
char = consumeNumberEntity(true)
elsif DIGITS.include? charStack[1]
# Decimal entity detected.
@stream.queue += charStack[1..-1]
@stream.unget(charStack[1..-1])
char = consumeNumberEntity(false)
else
# No number entity detected.
@stream.queue += charStack
@stream.unget(charStack)
@tokenQueue.push({:type => :ParseError, :data =>
_("Numeric entity expected but none found.")})
end
@ -209,6 +216,8 @@ module HTML5lib
filteredEntityList.reject! {|e| e[0].chr != charStack[0]}
entityName = nil
# Try to find the longest entity the string will match to take care
# of &noti for instance.
while charStack[-1] != :EOF
name = charStack.join('')
if filteredEntityList.any? {|e| e[0...name.length] == name}
@ -220,6 +229,7 @@ module HTML5lib
if ENTITIES.include? name
entityName = name
break if entityName[-1] == ';'
end
end
@ -228,15 +238,23 @@ module HTML5lib
# Check whether or not the last character returned can be
# discarded or needs to be put back.
if not charStack[-1] == ";"
if entityName[-1] != ?;
@tokenQueue.push({:type => :ParseError, :data =>
_("Named entity didn't end with ';'.")})
@stream.queue += charStack[entityName.length..-1]
end
if charStack[-1] != ";" and from_attribute and
(ASCII_LETTERS.include?(charStack[entityName.length]) or
DIGITS.include?(charStack[entityName.length]))
@stream.unget(charStack)
char = '&'
else
@stream.unget(charStack[entityName.length..-1])
end
else
@tokenQueue.push({:type => :ParseError, :data =>
_("Named entity expected. Got none.")})
@stream.queue += charStack
@stream.unget(charStack)
end
end
return char
@ -244,7 +262,7 @@ module HTML5lib
# This method replaces the need for "entityInAttributeValueState".
def processEntityInAttribute
entity = consumeEntity
entity = consumeEntity(true)
if entity
@currentToken[:data][-1][1] += entity
else
@ -274,20 +292,23 @@ module HTML5lib
@lastFourChars.shift if @lastFourChars.length > 4
end
if data == "&" and [:PCDATA,:RCDATA].include?(@contentModelFlag)
@state = @states[:entityData]
if data == "&" and !@escapeFlag and
[:PCDATA,:RCDATA].include?(@contentModelFlag)
@state = @states[:entityData]
elsif data == "-" and [:CDATA,:RCDATA].include?(@contentModelFlag) and
@escapeFlag == false and @lastFourChars.join('') == "<!--"
elsif data == "-" and !@escapeFlag and
[:CDATA,:RCDATA].include?(@contentModelFlag) and
@lastFourChars.join('') == "<!--"
@escapeFlag = true
@tokenQueue.push({:type => :Characters, :data => data})
elsif data == "<" and @escapeFlag == false and
elsif data == "<" and !@escapeFlag and
[:PCDATA,:CDATA,:RCDATA].include?(@contentModelFlag)
@state = @states[:tagOpen]
elsif data == ">" and [:CDATA,:RCDATA].include?(@contentModelFlag) and
@escapeFlag == true and @lastFourChars[1..-1].join('') == "-->"
elsif data == ">" and @escapeFlag and
[:CDATA,:RCDATA].include?(@contentModelFlag) and
@lastFourChars[1..-1].join('') == "-->"
@escapeFlag = false
@tokenQueue.push({:type => :Characters, :data => data})
@ -345,14 +366,14 @@ module HTML5lib
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected tag name. Got '?' instead (HTML doesn't " +
"support processing instructions).")})
@stream.queue.push(data)
@stream.unget(data)
@state = @states[:bogusComment]
else
# XXX
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected tag name. Got something else instead")})
@tokenQueue.push({:type => :Characters, :data => "<"})
@stream.queue.push(data)
@stream.unget(data)
@state = @states[:data]
end
else
@ -363,7 +384,7 @@ module HTML5lib
@state = @states[:closeTagOpen]
else
@tokenQueue.push({:type => :Characters, :data => "<"})
@stream.queue.insert(0, data)
@stream.unget(data)
@state = @states[:data]
end
end
@ -388,7 +409,7 @@ module HTML5lib
# Since this is just for checking, we put the characters back on
# the stack.
@stream.queue += charStack
@stream.unget(charStack)
end
if @currentToken and
@ -426,7 +447,7 @@ module HTML5lib
# XXX data can be _'_...
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected closing tag. Unexpected character '#{data}' found.")})
@stream.queue.push(data)
@stream.unget(data)
@state = @states[:bogusComment]
end
@ -556,7 +577,7 @@ module HTML5lib
@state = @states[:attributeValueDoubleQuoted]
elsif data == "&"
@state = @states[:attributeValueUnQuoted]
@stream.queue.push(data);
@stream.unget(data);
elsif data == "'"
@state = @states[:attributeValueSingleQuoted]
elsif data == ">"
@ -656,7 +677,7 @@ module HTML5lib
else
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected '--' or 'DOCTYPE'. Not found.")})
@stream.queue += charStack
@stream.unget(charStack)
@state = @states[:bogusComment]
end
end
@ -771,7 +792,7 @@ module HTML5lib
else
@tokenQueue.push({:type => :ParseError, :data =>
_("No space after literal string 'DOCTYPE'.")})
@stream.queue.push(data)
@stream.unget(data)
@state = @states[:beforeDoctypeName]
end
return true
@ -827,7 +848,7 @@ module HTML5lib
@state = @states[:data]
elsif data == :EOF
@currentToken[:data] = true
@stream.queue.push(data)
@stream.unget(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE.")})
@currentToken[:correct] = false
@ -842,7 +863,7 @@ module HTML5lib
elsif token == "system"
@state = @states[:beforeDoctypeSystemIdentifier]
else
@stream.queue += charStack
@stream.unget(charStack)
@tokenQueue.push({:type => :ParseError, :data =>
_("Expected 'public' or 'system'. Got '#{charStack.join('')}'")})
@state = @states[:bogusDoctype]
@ -1028,7 +1049,7 @@ module HTML5lib
@state = @states[:data]
elsif data == :EOF
# XXX EMIT
@stream.queue.push(data)
@stream.unget(data)
@tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in bogus doctype.")})
@currentToken[:correct] = false

View file

@ -1,17 +1,17 @@
module HTML5lib
module HTML5
module TreeBuilders
class << self
def [](name)
case name.to_s.downcase
when 'simpletree' then
require 'html5lib/treebuilders/simpletree'
require 'html5/treebuilders/simpletree'
SimpleTree::TreeBuilder
when 'rexml' then
require 'html5lib/treebuilders/rexml'
require 'html5/treebuilders/rexml'
REXML::TreeBuilder
when 'hpricot' then
require 'html5lib/treebuilders/hpricot'
require 'html5/treebuilders/hpricot'
Hpricot::TreeBuilder
else
raise "Unknown TreeBuilder #{name}"

View file

@ -1,8 +1,8 @@
require 'html5lib/constants'
require 'html5/constants'
#XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
module HTML5lib
module HTML5
# The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting

View file

@ -1,221 +1,221 @@
require 'html5lib/treebuilders/base'
require 'rubygems'
require 'hpricot'
require 'forwardable'
module HTML5lib
module TreeBuilders
module Hpricot
# Common base for the Hpricot-backed tree nodes. Each instance keeps the
# Hpricot object it wraps in @hpricot and mirrors structural changes onto it.
class Node < Base::Node
  extend Forwardable

  attr_accessor :hpricot
  def_delegators :@hpricot, :name

  # Runs the base initializer, then instantiates the backing Hpricot object
  # from the class returned by the subclass's +hpricot_class+.
  def initialize(name)
    super(name)
    @hpricot = self.class.hpricot_class.new(name)
  end

  # Appends +node+ as the last child. A text node arriving right after an
  # existing trailing text node is folded into that node's content instead
  # of being stored as a separate child. In both cases +node+ is removed
  # from the children of its previous Hpricot parent (if any) and its
  # parent references are pointed at this node.
  def appendChild(node)
    trailing = childNodes.last if node.kind_of?(TextNode) && childNodes.any?
    if trailing.kind_of?(TextNode)
      trailing.hpricot.content = trailing.hpricot.to_s + node.hpricot.to_s
    else
      childNodes << node
      hpricot.children << node.hpricot
    end

    former_parent = node.hpricot.parent
    unless former_parent.nil?
      former_parent.children.delete_at(former_parent.children.index(node.hpricot))
    end

    node.hpricot.parent = hpricot
    node.parent = self
  end

  # Removes +node+ from both the wrapper child list and the Hpricot child
  # list, and clears its parent references.
  def removeChild(node)
    childNodes.delete(node)
    backing_children = hpricot.children
    backing_children.delete_at(backing_children.index(node.hpricot))
    node.hpricot.parent = nil
    node.parent = nil
  end

  # Wraps +data+ in a TextNode and inserts it before +before+ when given,
  # otherwise appends it.
  def insertText(data, before=nil)
    text = TextNode.new(data)
    before ? insertBefore(text, before) : appendChild(text)
  end

  # Inserts +node+ immediately before the child +refNode+. A text node that
  # would land directly after another text node is merged into it instead.
  def insertBefore(node, refNode)
    position = childNodes.index(refNode)
    preceding = childNodes[position - 1] if node.kind_of?(TextNode) && position > 0
    if preceding.kind_of?(TextNode)
      preceding.hpricot.content = preceding.hpricot.to_s + node.hpricot.to_s
    else
      refNode.hpricot.parent.insert_before(node.hpricot, refNode.hpricot)
      childNodes.insert(position, node)
    end
  end

  # Reports whether the wrapper child list has any entries.
  def hasContent
    childNodes.any?
  end
end
# Element node backed by an Hpricot::Elem.
class Element < Node
  def self.hpricot_class
    ::Hpricot::Elem
  end

  # Runs the Node initializer, then replaces @hpricot with an Elem built
  # around a start tag carrying +name+.
  def initialize(name)
    super(name)
    start_tag = ::Hpricot::STag.new(name)
    @hpricot = ::Hpricot::Elem.new(start_tag)
  end

  # Tag name, read from the element's start tag.
  def name
    @hpricot.stag.name
  end

  # Builds a new node of the same class and name and copies every attribute
  # onto it. Child nodes are not copied.
  def cloneNode
    duplicate = self.class.new(name)
    attributes.each { |key, value| duplicate.hpricot[key] = value }
    duplicate
  end

  # Stand-in for the element's attribute hash. Per the original note, the
  # hash Hpricot hands out for an element's attributes is built on each
  # call, so writes to it would be lost. This proxy therefore sends []= to
  # the attribute hash held by the start tag and forwards every other
  # message to @hpricot.attributes.
  class AttributeProxy
    def initialize(hpricot)
      @hpricot = hpricot
    end

    def []=(k, v)
      start_tag = @hpricot.stag
      start_tag.send(stag_attributes_method)[k] = v
    end

    # Name of the start-tag accessor that exposes the writable hash.
    def stag_attributes_method
      # STag#attributes changed to STag#raw_attributes after Hpricot 0.5
      if @hpricot.stag.respond_to?(:raw_attributes)
        :raw_attributes
      else
        :attributes
      end
    end

    def method_missing(*args, &block)
      @hpricot.attributes.send(*args, &block)
    end
  end

  # Returns a fresh AttributeProxy for this element on every call.
  def attributes
    AttributeProxy.new(@hpricot)
  end

  # Copies each name/value pair of +attrs+ onto the Hpricot element.
  def attributes=(attrs)
    attrs.each do |key, value|
      @hpricot[key] = value
    end
  end

  # Renders this element, its attributes (except 'xmlns') and its children
  # as text. Every line starts with "\n|" followed by the indentation;
  # attributes and children are indented two spaces deeper than the tag.
  def printTree(indent=0)
    output = "\n|#{' ' * indent}<#{name}>"
    indent += 2
    padding = ' ' * indent
    attributes.each do |key, value|
      output += "\n|#{padding}#{key}=\"#{value}\"" unless key == 'xmlns'
    end
    childNodes.each { |child| output += child.printTree(indent) }
    output
  end
end
# Document root backed by an Hpricot::Doc.
class Document < Node
  class << self
    def hpricot_class
      ::Hpricot::Doc
    end
  end

  # A document has no name, so nil is handed to the Node initializer.
  def initialize
    super(nil)
  end

  # Renders '#document' followed by every child, each printed two spaces
  # deeper than +indent+.
  def printTree(indent=0)
    child_indent = indent + 2
    childNodes.inject('#document') { |output, child| output + child.printTree(child_indent) }
  end
end
# Doctype node backed by an Hpricot::DocType.
class DocumentType < Node
  class << self
    def hpricot_class
      ::Hpricot::DocType
    end
  end

  # The Node initializer instantiates hpricot_class with a single argument,
  # whereas DocType is constructed with three below; an ArgumentError from
  # the inherited initializer is deliberately ignored and @hpricot is then
  # assigned explicitly.
  def initialize(name)
    begin
      super(name)
    rescue ArgumentError # needs 3...
    end
    @hpricot = ::Hpricot::DocType.new(name, nil, nil)
  end

  # Renders the doctype as a single "\n|"-prefixed line using the Hpricot
  # object's target.
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|#{padding}<!DOCTYPE #{hpricot.target}>"
  end
end
# Fragment container: an Element created with an empty tag name.
class DocumentFragment < Element
  def initialize
    super('')
  end

  # Renders only the children (no line for the fragment itself), each
  # printed two spaces deeper than +indent+.
  def printTree(indent=0)
    child_indent = indent + 2
    childNodes.inject('') { |output, child| output + child.printTree(child_indent) }
  end
end
# Text node backed by an Hpricot::Text.
class TextNode < Node
  # Only sets @hpricot; the inherited initializer is not invoked.
  def initialize(data)
    @hpricot = ::Hpricot::Text.new(data)
  end

  # Renders the text content in double quotes on a "\n|"-prefixed line.
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|#{padding}\"#{hpricot.content}\""
  end
end
# Comment node backed by an Hpricot::Comment.
class CommentNode < Node
  class << self
    def hpricot_class
      ::Hpricot::Comment
    end
  end

  # Renders the comment content between "<!-- " and " -->" on a
  # "\n|"-prefixed line.
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|#{padding}<!-- #{hpricot.content} -->"
  end
end
# Tree builder that produces Hpricot-backed nodes.
class TreeBuilder < Base::TreeBuilder
  # Records which wrapper class to use for each kind of node. The inherited
  # initializer is not invoked here.
  def initialize
    @elementClass  = Element
    @commentClass  = CommentNode
    @doctypeClass  = DocumentType
    @documentClass = Document
    @fragmentClass = DocumentFragment
  end

  # Returns the printTree rendering of +node+.
  def testSerializer(node)
    node.printTree
  end

  # Returns the Hpricot object behind the current document wrapper.
  def getDocument
    @document.hpricot
  end

  # Stores the result of the inherited getFragment in @document and returns
  # the children of its Hpricot object.
  def getFragment
    @document = super
    @document.hpricot.children
  end
end
end
end
end
require 'html5/treebuilders/base'
require 'rubygems'
require 'hpricot'
require 'forwardable'
module HTML5
module TreeBuilders
module Hpricot
# Common base for the Hpricot-backed tree nodes. Each instance keeps the
# Hpricot object it wraps in @hpricot and mirrors structural changes onto it.
class Node < Base::Node
  extend Forwardable

  attr_accessor :hpricot
  def_delegators :@hpricot, :name

  # Runs the base initializer, then instantiates the backing Hpricot object
  # from the class returned by the subclass's +hpricot_class+.
  def initialize(name)
    super(name)
    @hpricot = self.class.hpricot_class.new(name)
  end

  # Appends +node+ as the last child. A text node arriving right after an
  # existing trailing text node is folded into that node's content instead
  # of being stored as a separate child. In both cases +node+ is removed
  # from the children of its previous Hpricot parent (if any) and its
  # parent references are pointed at this node.
  def appendChild(node)
    trailing = childNodes.last if node.kind_of?(TextNode) && childNodes.any?
    if trailing.kind_of?(TextNode)
      trailing.hpricot.content = trailing.hpricot.to_s + node.hpricot.to_s
    else
      childNodes << node
      hpricot.children << node.hpricot
    end

    former_parent = node.hpricot.parent
    unless former_parent.nil?
      former_parent.children.delete_at(former_parent.children.index(node.hpricot))
    end

    node.hpricot.parent = hpricot
    node.parent = self
  end

  # Removes +node+ from both the wrapper child list and the Hpricot child
  # list, and clears its parent references.
  def removeChild(node)
    childNodes.delete(node)
    backing_children = hpricot.children
    backing_children.delete_at(backing_children.index(node.hpricot))
    node.hpricot.parent = nil
    node.parent = nil
  end

  # Wraps +data+ in a TextNode and inserts it before +before+ when given,
  # otherwise appends it.
  def insertText(data, before=nil)
    text = TextNode.new(data)
    before ? insertBefore(text, before) : appendChild(text)
  end

  # Inserts +node+ immediately before the child +refNode+. A text node that
  # would land directly after another text node is merged into it instead.
  def insertBefore(node, refNode)
    position = childNodes.index(refNode)
    preceding = childNodes[position - 1] if node.kind_of?(TextNode) && position > 0
    if preceding.kind_of?(TextNode)
      preceding.hpricot.content = preceding.hpricot.to_s + node.hpricot.to_s
    else
      refNode.hpricot.parent.insert_before(node.hpricot, refNode.hpricot)
      childNodes.insert(position, node)
    end
  end

  # Reports whether the wrapper child list has any entries.
  def hasContent
    childNodes.any?
  end
end
# Element node backed by an Hpricot::Elem.
class Element < Node
  def self.hpricot_class
    ::Hpricot::Elem
  end

  # Runs the Node initializer, then replaces @hpricot with an Elem built
  # around a start tag carrying +name+.
  def initialize(name)
    super(name)
    start_tag = ::Hpricot::STag.new(name)
    @hpricot = ::Hpricot::Elem.new(start_tag)
  end

  # Tag name, read from the element's start tag.
  def name
    @hpricot.stag.name
  end

  # Builds a new node of the same class and name and copies every attribute
  # onto it. Child nodes are not copied.
  def cloneNode
    duplicate = self.class.new(name)
    attributes.each { |key, value| duplicate.hpricot[key] = value }
    duplicate
  end

  # Stand-in for the element's attribute hash. Per the original note, the
  # hash Hpricot hands out for an element's attributes is built on each
  # call, so writes to it would be lost. This proxy therefore sends []= to
  # the attribute hash held by the start tag and forwards every other
  # message to @hpricot.attributes.
  class AttributeProxy
    def initialize(hpricot)
      @hpricot = hpricot
    end

    def []=(k, v)
      start_tag = @hpricot.stag
      start_tag.send(stag_attributes_method)[k] = v
    end

    # Name of the start-tag accessor that exposes the writable hash.
    def stag_attributes_method
      # STag#attributes changed to STag#raw_attributes after Hpricot 0.5
      if @hpricot.stag.respond_to?(:raw_attributes)
        :raw_attributes
      else
        :attributes
      end
    end

    def method_missing(*args, &block)
      @hpricot.attributes.send(*args, &block)
    end
  end

  # Returns a fresh AttributeProxy for this element on every call.
  def attributes
    AttributeProxy.new(@hpricot)
  end

  # Copies each name/value pair of +attrs+ onto the Hpricot element.
  def attributes=(attrs)
    attrs.each do |key, value|
      @hpricot[key] = value
    end
  end

  # Renders this element, its attributes (except 'xmlns') and its children
  # as text. Every line starts with "\n|" followed by the indentation;
  # attributes and children are indented two spaces deeper than the tag.
  def printTree(indent=0)
    output = "\n|#{' ' * indent}<#{name}>"
    indent += 2
    padding = ' ' * indent
    attributes.each do |key, value|
      output += "\n|#{padding}#{key}=\"#{value}\"" unless key == 'xmlns'
    end
    childNodes.each { |child| output += child.printTree(indent) }
    output
  end
end
# Document root backed by an Hpricot::Doc.
class Document < Node
  class << self
    def hpricot_class
      ::Hpricot::Doc
    end
  end

  # A document has no name, so nil is handed to the Node initializer.
  def initialize
    super(nil)
  end

  # Renders '#document' followed by every child, each printed two spaces
  # deeper than +indent+.
  def printTree(indent=0)
    child_indent = indent + 2
    childNodes.inject('#document') { |output, child| output + child.printTree(child_indent) }
  end
end
# Doctype node backed by an Hpricot::DocType.
class DocumentType < Node
  class << self
    def hpricot_class
      ::Hpricot::DocType
    end
  end

  # The Node initializer instantiates hpricot_class with a single argument,
  # whereas DocType is constructed with three below; an ArgumentError from
  # the inherited initializer is deliberately ignored and @hpricot is then
  # assigned explicitly.
  def initialize(name)
    begin
      super(name)
    rescue ArgumentError # needs 3...
    end
    @hpricot = ::Hpricot::DocType.new(name, nil, nil)
  end

  # Renders the doctype as a single "\n|"-prefixed line using the Hpricot
  # object's target.
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|#{padding}<!DOCTYPE #{hpricot.target}>"
  end
end
# Fragment container: an Element created with an empty tag name.
class DocumentFragment < Element
  def initialize
    super('')
  end

  # Renders only the children (no line for the fragment itself), each
  # printed two spaces deeper than +indent+.
  def printTree(indent=0)
    child_indent = indent + 2
    childNodes.inject('') { |output, child| output + child.printTree(child_indent) }
  end
end
# Text node backed by an Hpricot::Text.
class TextNode < Node
  # Only sets @hpricot; the inherited initializer is not invoked.
  def initialize(data)
    @hpricot = ::Hpricot::Text.new(data)
  end

  # Renders the text content in double quotes on a "\n|"-prefixed line.
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|#{padding}\"#{hpricot.content}\""
  end
end
# Comment node backed by an Hpricot::Comment.
class CommentNode < Node
  class << self
    def hpricot_class
      ::Hpricot::Comment
    end
  end

  # Renders the comment content between "<!-- " and " -->" on a
  # "\n|"-prefixed line.
  def printTree(indent=0)
    padding = ' ' * indent
    "\n|#{padding}<!-- #{hpricot.content} -->"
  end
end
# Tree builder that produces Hpricot-backed nodes.
class TreeBuilder < Base::TreeBuilder
  # Records which wrapper class to use for each kind of node. The inherited
  # initializer is not invoked here.
  def initialize
    @elementClass  = Element
    @commentClass  = CommentNode
    @doctypeClass  = DocumentType
    @documentClass = Document
    @fragmentClass = DocumentFragment
  end

  # Returns the printTree rendering of +node+.
  def testSerializer(node)
    node.printTree
  end

  # Returns the Hpricot object behind the current document wrapper.
  def getDocument
    @document.hpricot
  end

  # Stores the result of the inherited getFragment in @document and returns
  # the children of its Hpricot object.
  def getFragment
    @document = super
    @document.hpricot.children
  end
end
end
end
end

View file

@ -1,8 +1,8 @@
require 'html5lib/treebuilders/base'
require 'html5/treebuilders/base'
require 'rexml/document'
require 'forwardable'
module HTML5lib
module HTML5
module TreeBuilders
module REXML

View file

@ -1,6 +1,6 @@
require 'html5lib/treebuilders/base'
require 'html5/treebuilders/base'
module HTML5lib
module HTML5
module TreeBuilders
module SimpleTree

View file

@ -1,19 +1,19 @@
require 'html5lib/treewalkers/base'
require 'html5/treewalkers/base'
module HTML5lib
module HTML5
module TreeWalkers
class << self
def [](name)
case name.to_s.downcase
when 'simpletree' then
require 'html5lib/treewalkers/simpletree'
require 'html5/treewalkers/simpletree'
SimpleTree::TreeWalker
when 'rexml' then
require 'html5lib/treewalkers/rexml'
require 'html5/treewalkers/rexml'
REXML::TreeWalker
when 'hpricot' then
require 'html5lib/treewalkers/hpricot'
require 'html5/treewalkers/hpricot'
Hpricot::TreeWalker
else
raise "Unknown TreeWalker #{name}"

View file

@ -1,5 +1,5 @@
require 'html5lib/constants'
module HTML5lib
require 'html5/constants'
module HTML5
module TreeWalkers
module TokenConstructor

View file

@ -1,10 +1,10 @@
require 'html5lib/treewalkers/base'
require 'html5/treewalkers/base'
require 'rexml/document'
module HTML5lib
module HTML5
module TreeWalkers
module Hpricot
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
def node_details(node)
case node

View file

@ -1,10 +1,10 @@
require 'html5lib/treewalkers/base'
require 'html5/treewalkers/base'
require 'rexml/document'
module HTML5lib
module HTML5
module TreeWalkers
module REXML
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
def node_details(node)
case node

View file

@ -1,10 +1,10 @@
require 'html5lib/treewalkers/base'
require 'html5/treewalkers/base'
module HTML5lib
module HTML5
module TreeWalkers
module SimpleTree
class TreeWalker < HTML5lib::TreeWalkers::Base
include HTML5lib::TreeBuilders::SimpleTree
class TreeWalker < HTML5::TreeWalkers::Base
include HTML5::TreeBuilders::SimpleTree
def walk(node)
case node

View file

@ -1,708 +0,0 @@
module HTML5lib
class EOF < Exception; end
CONTENT_MODEL_FLAGS = [
:PCDATA,
:RCDATA,
:CDATA,
:PLAINTEXT
]
SCOPING_ELEMENTS = %w[
button
caption
html
marquee
object
table
td
th
]
FORMATTING_ELEMENTS = %w[
a
b
big
em
font
i
nobr
s
small
strike
strong
tt
u
]
SPECIAL_ELEMENTS = %w[
address
area
base
basefont
bgsound
blockquote
body
br
center
col
colgroup
dd
dir
div
dl
dt
embed
fieldset
form
frame
frameset
h1
h2
h3
h4
h5
h6
head
hr
iframe
image
img
input
isindex
li
link
listing
menu
meta
noembed
noframes
noscript
ol
optgroup
option
p
param
plaintext
pre
script
select
spacer
style
tbody
textarea
tfoot
thead
title
tr
ul
wbr
]
SPACE_CHARACTERS = %W[
\t
\n
\x0B
\x0C
\x20
\r
]
TABLE_INSERT_MODE_ELEMENTS = %w[
table
tbody
tfoot
thead
tr
]
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
DIGITS = '0'..'9'
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
# Heading elements need to be ordered
HEADING_ELEMENTS = %w[
h1
h2
h3
h4
h5
h6
]
# XXX What about event-source and command?
VOID_ELEMENTS = %w[
base
link
meta
hr
br
img
embed
param
area
col
input
]
CDATA_ELEMENTS = %w[title textarea]
RCDATA_ELEMENTS = %w[
style
script
xmp
iframe
noembed
noframes
noscript
]
BOOLEAN_ATTRIBUTES = {
:global => %w[irrelevant],
'style' => %w[scoped],
'img' => %w[ismap],
'audio' => %w[autoplay controls],
'video' => %w[autoplay controls],
'script' => %w[defer async],
'details' => %w[open],
'datagrid' => %w[multiple disabled],
'command' => %w[hidden disabled checked default],
'menu' => %w[autosubmit],
'fieldset' => %w[disabled readonly],
'option' => %w[disabled readonly selected],
'optgroup' => %w[disabled readonly],
'button' => %w[disabled autofocus],
'input' => %w[disabled readonly required autofocus checked ismap],
'select' => %w[disabled readonly autofocus multiple],
'output' => %w[disabled readonly]
}
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN
65533, # 0x81 UNDEFINED
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
8224, # 0x86 0x2020 DAGGER
8225, # 0x87 0x2021 DOUBLE DAGGER
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
8240, # 0x89 0x2030 PER MILLE SIGN
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
65533, # 0x8D UNDEFINED
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
65533, # 0x8F UNDEFINED
65533, # 0x90 UNDEFINED
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
8226, # 0x95 0x2022 BULLET
8211, # 0x96 0x2013 EN DASH
8212, # 0x97 0x2014 EM DASH
732, # 0x98 0x02DC SMALL TILDE
8482, # 0x99 0x2122 TRADE MARK SIGN
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
65533, # 0x9D UNDEFINED
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
]
private
def self.U n
[n].pack('U')
end
public
ENTITIES = {
"AElig" => U(0xC6),
"Aacute" => U(0xC1),
"Acirc" => U(0xC2),
"Agrave" => U(0xC0),
"Alpha" => U(0x0391),
"Aring" => U(0xC5),
"Atilde" => U(0xC3),
"Auml" => U(0xC4),
"Beta" => U(0x0392),
"Ccedil" => U(0xC7),
"Chi" => U(0x03A7),
"Dagger" => U(0x2021),
"Delta" => U(0x0394),
"ETH" => U(0xD0),
"Eacute" => U(0xC9),
"Ecirc" => U(0xCA),
"Egrave" => U(0xC8),
"Epsilon" => U(0x0395),
"Eta" => U(0x0397),
"Euml" => U(0xCB),
"Gamma" => U(0x0393),
"Iacute" => U(0xCD),
"Icirc" => U(0xCE),
"Igrave" => U(0xCC),
"Iota" => U(0x0399),
"Iuml" => U(0xCF),
"Kappa" => U(0x039A),
"Lambda" => U(0x039B),
"Mu" => U(0x039C),
"Ntilde" => U(0xD1),
"Nu" => U(0x039D),
"OElig" => U(0x0152),
"Oacute" => U(0xD3),
"Ocirc" => U(0xD4),
"Ograve" => U(0xD2),
"Omega" => U(0x03A9),
"Omicron" => U(0x039F),
"Oslash" => U(0xD8),
"Otilde" => U(0xD5),
"Ouml" => U(0xD6),
"Phi" => U(0x03A6),
"Pi" => U(0x03A0),
"Prime" => U(0x2033),
"Psi" => U(0x03A8),
"Rho" => U(0x03A1),
"Scaron" => U(0x0160),
"Sigma" => U(0x03A3),
"THORN" => U(0xDE),
"Tau" => U(0x03A4),
"Theta" => U(0x0398),
"Uacute" => U(0xDA),
"Ucirc" => U(0xDB),
"Ugrave" => U(0xD9),
"Upsilon" => U(0x03A5),
"Uuml" => U(0xDC),
"Xi" => U(0x039E),
"Yacute" => U(0xDD),
"Yuml" => U(0x0178),
"Zeta" => U(0x0396),
"aacute" => U(0xE1),
"acirc" => U(0xE2),
"acute" => U(0xB4),
"aelig" => U(0xE6),
"agrave" => U(0xE0),
"alefsym" => U(0x2135),
"alpha" => U(0x03B1),
"amp" => U(0x26),
"AMP" => U(0x26),
"and" => U(0x2227),
"ang" => U(0x2220),
"apos" => U(0x27),
"aring" => U(0xE5),
"asymp" => U(0x2248),
"atilde" => U(0xE3),
"auml" => U(0xE4),
"bdquo" => U(0x201E),
"beta" => U(0x03B2),
"brvbar" => U(0xA6),
"bull" => U(0x2022),
"cap" => U(0x2229),
"ccedil" => U(0xE7),
"cedil" => U(0xB8),
"cent" => U(0xA2),
"chi" => U(0x03C7),
"circ" => U(0x02C6),
"clubs" => U(0x2663),
"cong" => U(0x2245),
"copy" => U(0xA9),
"COPY" => U(0xA9),
"crarr" => U(0x21B5),
"cup" => U(0x222A),
"curren" => U(0xA4),
"dArr" => U(0x21D3),
"dagger" => U(0x2020),
"darr" => U(0x2193),
"deg" => U(0xB0),
"delta" => U(0x03B4),
"diams" => U(0x2666),
"divide" => U(0xF7),
"eacute" => U(0xE9),
"ecirc" => U(0xEA),
"egrave" => U(0xE8),
"empty" => U(0x2205),
"emsp" => U(0x2003),
"ensp" => U(0x2002),
"epsilon" => U(0x03B5),
"equiv" => U(0x2261),
"eta" => U(0x03B7),
"eth" => U(0xF0),
"euml" => U(0xEB),
"euro" => U(0x20AC),
"exist" => U(0x2203),
"fnof" => U(0x0192),
"forall" => U(0x2200),
"frac12" => U(0xBD),
"frac14" => U(0xBC),
"frac34" => U(0xBE),
"frasl" => U(0x2044),
"gamma" => U(0x03B3),
"ge" => U(0x2265),
"gt" => U(0x3E),
"GT" => U(0x3E),
"hArr" => U(0x21D4),
"harr" => U(0x2194),
"hearts" => U(0x2665),
"hellip" => U(0x2026),
"iacute" => U(0xED),
"icirc" => U(0xEE),
"iexcl" => U(0xA1),
"igrave" => U(0xEC),
"image" => U(0x2111),
"infin" => U(0x221E),
"int" => U(0x222B),
"iota" => U(0x03B9),
"iquest" => U(0xBF),
"isin" => U(0x2208),
"iuml" => U(0xEF),
"kappa" => U(0x03BA),
"lArr" => U(0x21D0),
"lambda" => U(0x03BB),
"lang" => U(0x2329),
"laquo" => U(0xAB),
"larr" => U(0x2190),
"lceil" => U(0x2308),
"ldquo" => U(0x201C),
"le" => U(0x2264),
"lfloor" => U(0x230A),
"lowast" => U(0x2217),
"loz" => U(0x25CA),
"lrm" => U(0x200E),
"lsaquo" => U(0x2039),
"lsquo" => U(0x2018),
"lt" => U(0x3C),
"LT" => U(0x3C),
"macr" => U(0xAF),
"mdash" => U(0x2014),
"micro" => U(0xB5),
"middot" => U(0xB7),
"minus" => U(0x2212),
"mu" => U(0x03BC),
"nabla" => U(0x2207),
"nbsp" => U(0xA0),
"ndash" => U(0x2013),
"ne" => U(0x2260),
"ni" => U(0x220B),
"not" => U(0xAC),
"notin" => U(0x2209),
"nsub" => U(0x2284),
"ntilde" => U(0xF1),
"nu" => U(0x03BD),
"oacute" => U(0xF3),
"ocirc" => U(0xF4),
"oelig" => U(0x0153),
"ograve" => U(0xF2),
"oline" => U(0x203E),
"omega" => U(0x03C9),
"omicron" => U(0x03BF),
"oplus" => U(0x2295),
"or" => U(0x2228),
"ordf" => U(0xAA),
"ordm" => U(0xBA),
"oslash" => U(0xF8),
"otilde" => U(0xF5),
"otimes" => U(0x2297),
"ouml" => U(0xF6),
"para" => U(0xB6),
"part" => U(0x2202),
"permil" => U(0x2030),
"perp" => U(0x22A5),
"phi" => U(0x03C6),
"pi" => U(0x03C0),
"piv" => U(0x03D6),
"plusmn" => U(0xB1),
"pound" => U(0xA3),
"prime" => U(0x2032),
"prod" => U(0x220F),
"prop" => U(0x221D),
"psi" => U(0x03C8),
"quot" => U(0x22),
"QUOT" => U(0x22),
"rArr" => U(0x21D2),
"radic" => U(0x221A),
"rang" => U(0x232A),
"raquo" => U(0xBB),
"rarr" => U(0x2192),
"rceil" => U(0x2309),
"rdquo" => U(0x201D),
"real" => U(0x211C),
"reg" => U(0xAE),
"REG" => U(0xAE),
"rfloor" => U(0x230B),
"rho" => U(0x03C1),
"rlm" => U(0x200F),
"rsaquo" => U(0x203A),
"rsquo" => U(0x2019),
"sbquo" => U(0x201A),
"scaron" => U(0x0161),
"sdot" => U(0x22C5),
"sect" => U(0xA7),
"shy" => U(0xAD),
"sigma" => U(0x03C3),
"sigmaf" => U(0x03C2),
"sim" => U(0x223C),
"spades" => U(0x2660),
"sub" => U(0x2282),
"sube" => U(0x2286),
"sum" => U(0x2211),
"sup" => U(0x2283),
"sup1" => U(0xB9),
"sup2" => U(0xB2),
"sup3" => U(0xB3),
"supe" => U(0x2287),
"szlig" => U(0xDF),
"tau" => U(0x03C4),
"there4" => U(0x2234),
"theta" => U(0x03B8),
"thetasym" => U(0x03D1),
"thinsp" => U(0x2009),
"thorn" => U(0xFE),
"tilde" => U(0x02DC),
"times" => U(0xD7),
"trade" => U(0x2122),
"uArr" => U(0x21D1),
"uacute" => U(0xFA),
"uarr" => U(0x2191),
"ucirc" => U(0xFB),
"ugrave" => U(0xF9),
"uml" => U(0xA8),
"upsih" => U(0x03D2),
"upsilon" => U(0x03C5),
"uuml" => U(0xFC),
"weierp" => U(0x2118),
"xi" => U(0x03BE),
"yacute" => U(0xFD),
"yen" => U(0xA5),
"yuml" => U(0xFF),
"zeta" => U(0x03B6),
"zwj" => U(0x200D),
"zwnj" => U(0x200C)
}
ENCODINGS = %w[
ansi_x3.4-1968
iso-ir-6
ansi_x3.4-1986
iso_646.irv:1991
ascii
iso646-us
us-ascii
us
ibm367
cp367
csascii
ks_c_5601-1987
korean
iso-2022-kr
csiso2022kr
euc-kr
iso-2022-jp
csiso2022jp
iso-2022-jp-2
iso-ir-58
chinese
csiso58gb231280
iso_8859-1:1987
iso-ir-100
iso_8859-1
iso-8859-1
latin1
l1
ibm819
cp819
csisolatin1
iso_8859-2:1987
iso-ir-101
iso_8859-2
iso-8859-2
latin2
l2
csisolatin2
iso_8859-3:1988
iso-ir-109
iso_8859-3
iso-8859-3
latin3
l3
csisolatin3
iso_8859-4:1988
iso-ir-110
iso_8859-4
iso-8859-4
latin4
l4
csisolatin4
iso_8859-6:1987
iso-ir-127
iso_8859-6
iso-8859-6
ecma-114
asmo-708
arabic
csisolatinarabic
iso_8859-7:1987
iso-ir-126
iso_8859-7
iso-8859-7
elot_928
ecma-118
greek
greek8
csisolatingreek
iso_8859-8:1988
iso-ir-138
iso_8859-8
iso-8859-8
hebrew
csisolatinhebrew
iso_8859-5:1988
iso-ir-144
iso_8859-5
iso-8859-5
cyrillic
csisolatincyrillic
iso_8859-9:1989
iso-ir-148
iso_8859-9
iso-8859-9
latin5
l5
csisolatin5
iso-8859-10
iso-ir-157
l6
iso_8859-10:1992
csisolatin6
latin6
hp-roman8
roman8
r8
ibm037
cp037
csibm037
ibm424
cp424
csibm424
ibm437
cp437
437
cspc8codepage437
ibm500
cp500
csibm500
ibm775
cp775
cspc775baltic
ibm850
cp850
850
cspc850multilingual
ibm852
cp852
852
cspcp852
ibm855
cp855
855
csibm855
ibm857
cp857
857
csibm857
ibm860
cp860
860
csibm860
ibm861
cp861
861
cp-is
csibm861
ibm862
cp862
862
cspc862latinhebrew
ibm863
cp863
863
csibm863
ibm864
cp864
csibm864
ibm865
cp865
865
csibm865
ibm866
cp866
866
csibm866
ibm869
cp869
869
cp-gr
csibm869
ibm1026
cp1026
csibm1026
koi8-r
cskoi8r
koi8-u
big5-hkscs
ptcp154
csptcp154
pt154
cp154
utf-7
utf-16be
utf-16le
utf-16
utf-8
iso-8859-13
iso-8859-14
iso-ir-199
iso_8859-14:1998
iso_8859-14
latin8
iso-celtic
l8
iso-8859-15
iso_8859-15
iso-8859-16
iso-ir-226
iso_8859-16:2001
iso_8859-16
latin10
l10
gbk
cp936
ms936
gb18030
shift_jis
ms_kanji
csshiftjis
euc-jp
gb2312
big5
csbig5
windows-1250
windows-1251
windows-1252
windows-1253
windows-1254
windows-1255
windows-1256
windows-1257
windows-1258
tis-620
hz-gb-2312
]
end

View file

@ -1 +0,0 @@
require 'html5lib/filters/optionaltags'

View file

@ -1,2 +0,0 @@
require 'html5lib/serializer/htmlserializer'
require 'html5lib/serializer/xhtmlserializer'

View file

@ -26,15 +26,15 @@ def parse(opts, args)
exit(1)
end
require 'html5lib/treebuilders'
treebuilder = HTML5lib::TreeBuilders[opts.treebuilder]
require 'html5/treebuilders'
treebuilder = HTML5::TreeBuilders[opts.treebuilder]
if opts.output == :xml
require 'html5lib/liberalxmlparser'
p = HTML5lib::XHTMLParser.new(:tree=>treebuilder)
require 'html5/liberalxmlparser'
p = HTML5::XHTMLParser.new(:tree=>treebuilder)
else
require 'html5lib/html5parser'
p = HTML5lib::HTMLParser.new(:tree=>treebuilder)
require 'html5/html5parser'
p = HTML5::HTMLParser.new(:tree=>treebuilder)
end
if opts.parsemethod == :parse
@ -70,10 +70,10 @@ def printOutput(parser, document, opts)
when :xml
print document
when :html
require 'html5lib/treewalkers'
tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document)
require 'html5lib/serializer'
puts HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer)
require 'html5/treewalkers'
tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
require 'html5/serializer'
puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
when :hilite
print document.hilite
when :tree
@ -188,6 +188,10 @@ opts = OptionParser.new do |opts|
options.serializer[:escape_lt_in_attrs] = lt
end
opts.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata|
options.serializer[:escape_rcdata] = rcdata
end
opts.separator ""
opts.separator "Other Options:"

View file

@ -33,7 +33,6 @@ EUC-jp
#encoding
EUC-jp
#data
<!-- -->
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">

View file

@ -92,7 +92,8 @@
{"description": "rcdata",
"input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]],
"expected": ["<script>a<b>c&d"]
"expected": ["<script>a<b>c&d"],
"xhtml": ["<script>a&lt;b&gt;c&amp;d"]
},
{"description": "doctype",

View file

@ -49,6 +49,12 @@
"options": {"escape_lt_in_attrs": true},
"input": [["StartTag", "a", {"title": "a<b>c&d"}]],
"expected": ["<a title=\"a&lt;b>c&amp;d\">"]
},
{"description": "rcdata",
"options": {"escape_rcdata": true},
"input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]],
"expected": ["<script>a&lt;b&gt;c&amp;d"]
}
]}

View file

@ -135,7 +135,7 @@
{"description":"Entity without trailing semicolon (2)",
"input":"I'm &notin",
"output":[["Character","I'm "], "ParseError", ["Character", ""]]},
"output":[["Character","I'm "], "ParseError", ["Character", "¬in"]]},
{"description":"Partial entity match at end of file",
"input":"I'm &no",
@ -151,6 +151,18 @@
{"description":"Hexadecimal entity in attribute",
"input":"<h a='&#x3f;'></h>",
"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]}
"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]},
{"description":"Entity in attribute without semicolon ending in x",
"input":"<h a='&notx'>",
"output":["ParseError", ["StartTag", "h", {"a":"&notx"}]]},
{"description":"Entity in attribute without semicolon ending in 1",
"input":"<h a='&not1'>",
"output":["ParseError", ["StartTag", "h", {"a":"&not1"}]]},
{"description":"Entity in attribute without semicolon",
"input":"<h a='&COPY'>",
"output":["ParseError", ["StartTag", "h", {"a":"©"}]]}
]}

View file

@ -42,19 +42,23 @@
{"description":"Numeric entity representing the NUL character",
"input":"&#0000;",
"output":[["Character", "\uFFFD"]]},
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity representing the NUL character",
"input":"&#x0000;",
"output":[["Character", "\uFFFD"]]},
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#2225222;",
"output":[["Character", "\uFFFD"]]},
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#x1010FFFF;",
"output":[["Character", "\uFFFD"]]},
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity pair representing a surrogate pair",
"input":"&#xD869;&#xDED6;",
"output":["ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"]]},
{"description":"Numeric entity representing a Windows-1252 'codepoint'",
"input":"&#137;",
@ -118,7 +122,7 @@
{"description":"Null Byte Replacement",
"input":"\u0000",
"output":[["Character", "\ufffd"]]}
"output":["ParseError", ["Character", "\ufffd"]]}
]}

View file

@ -285,6 +285,7 @@ Line1<br>Line2<br>Line3<br>Line4
| <div>
| <b>
| <marquee>
| <p>
| "X"
#data
@ -330,6 +331,7 @@ Unexpected end of file
| <body>
| <p>
| <hr>
| <p>
#data
<select><b><option><select><option></b></select>X
@ -1369,13 +1371,14 @@ unexpected EOF
<head></p><meta><p>
#errors
6: missing document type declaration
10: unexpected p element end tag in head
10: unexpected p element end tag
#document
| <html>
| <head>
| <meta>
| <body>
| <p>
| <meta>
| <p>
#data
<head></html><meta><p>
@ -1485,6 +1488,7 @@ unexpected EOF
| <div>
| <b>
| <marquee>
| <p>
#data
<script></script></div><title></title><p><p>
@ -1511,6 +1515,7 @@ unexpected EOF
| <body>
| <p>
| <hr>
| <p>
#data
<select><b><option><select><option></b></select>
@ -1807,6 +1812,7 @@ Unexpected EOF
| <head>
| <body>
| <br>
| <p>
#data
<table><tr></strong></b></em></i></u></strike></s></blink></tt></pre></big></small></font></select></h1></h2></h3></h4></h5></h6></body></br></a></img></title></span></style></script></table></th></td></tr></frame></area></link></param></hr></input></col></base></meta></basefont></bgsound></embed></spacer></p></dd></dt></caption></colgroup></tbody></tfoot></thead></address></blockquote></center></dir></div></dl></fieldset></listing></menu></ol></ul></li></nobr></wbr></form></button></marquee></object></html></frameset></head></iframe></image></isindex></noembed></noframes></noscript></optgroup></option></plaintext></textarea>
@ -1928,3 +1934,4 @@ Unexpected EOF
| <table>
| <tbody>
| <tr>
| <p>

View file

@ -777,3 +777,4 @@ Unexpected </p> end tag.
| <tbody>
| <tr>
| <td>
| <p>

View file

@ -61,7 +61,6 @@ No DOCTYPE
#data
<!DOCTYPE htML><html><head></head><body><pre>
foo</pre></body></html>
#errors
#document
@ -72,10 +71,22 @@ foo</pre></body></html>
| <pre>
| "foo"
#data
<!DOCTYPE htML><html><head></head><body><pre>
foo</pre></body></html>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <pre>
| "
foo"
#data
<!DOCTYPE htML><html><head></head><body><pre>
foo
</pre></body></html>
#errors
@ -183,7 +194,6 @@ y</pre></body></html>
#data
<!DOCTYPE htML><textarea>
foo</textarea>
#errors
#document
@ -194,6 +204,20 @@ foo</textarea>
| <textarea>
| "foo"
#data
<!DOCTYPE htML><textarea>
foo</textarea>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <textarea>
| "
foo"
#data
<!DOCTYPE htML><html><head></head><body><ul><li><div><p><li></ul></body></html>
#errors

View file

@ -1,37 +1,49 @@
#data
direct div content
#errors
#document-fragment div
#document-fragment
div
#document
| "direct div content"
#data
direct textarea content
#errors
#document-fragment textarea
#document-fragment
textarea
#document
| "direct textarea content"
#data
textarea content with <em>pseudo</em> <foo>markup
#errors
#document-fragment textarea
#document-fragment
textarea
#document
| "textarea content with <em>pseudo</em> <foo>markup"
#data
this is &#x0043;DATA inside a <style> element
#errors
#document-fragment style
#document-fragment
style
#document
| "this is &#x0043;DATA inside a <style> element"
#data
</plaintext>
#errors
#document-fragment plaintext
#document-fragment
plaintext
#document
| "</plaintext>"
#data
setting html's innerHTML
#errors
#document-fragment html
#document-fragment
html
#document
| <head>
| <body>
| "setting html's innerHTML"
@ -39,6 +51,8 @@ setting html's innerHTML
#data
<title>setting head's innerHTML</title>
#errors
#document-fragment head
#document-fragment
head
#document
| <title>
| "setting head's innerHTML"

View file

@ -27,3 +27,41 @@
| <head>
| <body>
| <meta>
#data
<!doctype HTml><form><div></form><div>
#errors
Form end tag ignored.
Unexpected end of file.
#document
| <!DOCTYPE HTml>
| <html>
| <head>
| <body>
| <form>
| <div>
| <div>
#data
<!doctype HTml><title>&amp;</title>
#errors
Unexpected end of file.
#document
| <!DOCTYPE HTml>
| <html>
| <head>
| <title>
| "&"
| <body>
#data
<!doctype HTml><title><!--&amp;--></title>
#errors
Unexpected end of file.
#document
| <!DOCTYPE HTml>
| <html>
| <head>
| <title>
| "<!--&amp;-->"
| <body>

View file

@ -1,81 +1,81 @@
require 'test/unit'
HTML5LIB_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))
if File.exists?(File.join(HTML5LIB_BASE, 'testdata'))
TESTDATA_DIR = File.join(HTML5LIB_BASE, 'testdata')
else
TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata')
end
$:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')
$:.unshift File.dirname(__FILE__)
def html5lib_test_files(subdirectory)
Dir[File.join(TESTDATA_DIR, subdirectory, '*.*')]
end
begin
require 'rubygems'
require 'json'
rescue LoadError
class JSON
def self.parse json
json.gsub!(/"\s*:/, '"=>')
json.gsub!(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
null = nil
eval json
end
end
end
module HTML5lib
module TestSupport
def self.startswith?(a, b)
b[0... a.length] == a
end
def self.parseTestcase(data)
innerHTML = nil
input = []
output = []
errors = []
currentList = input
data.split(/\n/).each do |line|
if !line.empty? and !startswith?("#errors", line) and
!startswith?("#document", line) and
!startswith?("#data", line) and
!startswith?("#document-fragment", line)
if currentList == output and startswith?("|", line)
currentList.push(line[2..-1])
else
currentList.push(line)
end
elsif line == "#errors"
currentList = errors
elsif line == "#document" or startswith?("#document-fragment", line)
if startswith?("#document-fragment", line)
innerHTML = line[19..-1]
raise AssertionError unless innerHTML
end
currentList = output
end
end
return innerHTML, input.join("\n"), output.join("\n"), errors
end
# convert the output of str(document) to the format used in the testcases
def convertTreeDump(treedump)
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
end
def sortattrs(output)
output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
match.split("\n").sort.join("\n")
end
end
end
end
require 'test/unit'
HTML5_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))
if File.exists?(File.join(HTML5_BASE, 'testdata'))
TESTDATA_DIR = File.join(HTML5_BASE, 'testdata')
else
TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata')
end
$:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')
$:.unshift File.dirname(__FILE__)
# Lists the fixture files belonging to one test suite.
#
# subdirectory - name of a directory located directly under TESTDATA_DIR.
#
# Returns an Array of the paths in that directory matching "*.*"; the Array
# is empty when nothing matches.
def html5_test_files(subdirectory)
  pattern = File.join(TESTDATA_DIR, subdirectory, '*.*')
  Dir.glob(pattern)
end
begin
require 'rubygems'
require 'json'
rescue LoadError
# Minimal stand-in for the json library; it is only defined when
# `require 'json'` raised LoadError. It rewrites the JSON text into a Ruby
# literal and evaluates it.
class JSON
  # Parses a JSON document by translating it into Ruby source.
  #
  # json - String holding the JSON text. It is left unmodified.
  #
  # Returns the evaluated Ruby value (Hashes get String keys).
  #
  # SECURITY: the text is passed to eval, so this must only ever be fed
  # trusted fixture files, never external input.
  def self.parse(json)
    # Use gsub (not gsub!) so the caller's string is not rewritten in place
    # and frozen strings are accepted.
    ruby = json.gsub(/"\s*:/, '"=>')
    # Expand \uXXXX escapes into the UTF-8 character they denote.
    ruby = ruby.gsub(/\\u[0-9a-fA-F]{4}/) { |escape| [escape[2..-1].to_i(16)].pack('U') }
    null = nil # lets a bare JSON `null` resolve to nil inside the eval below
    eval ruby
  end
end
end
module HTML5
  # Helpers shared by the test cases: tree-dump normalisation and a reader
  # for the sectioned ".dat" fixture files.
  module TestSupport
    # Convert the output of str(document) to the format used in the testcases.
    #
    # The first line of the dump is dropped; every remaining line longer than
    # two characters that starts with "|" loses its first three characters.
    #
    # treedump - String tree dump.
    #
    # Returns the converted String ("" when the dump has no lines at all).
    def convertTreeDump(treedump)
      body = treedump.split(/\n/)[1..-1] || [] # [][1..-1] is nil for an empty dump
      converted = body.map do |line|
        (line.length > 2 && line[0] == ?|) ? line[3..-1] : line
      end
      converted.join("\n")
    end

    # Sorts each run of two or more consecutive, equally indented
    # "name=value" lines so attribute order does not affect comparisons.
    #
    # output - String tree dump.
    #
    # Returns a new String with every such run sorted.
    def sortattrs(output)
      output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |run|
        run.split("\n").sort.join("\n")
      end
    end

    # Enumerates the records of a fixture file made of "#heading" lines, each
    # followed by that section's text. A record starts whenever the first
    # configured heading is seen again.
    class TestData
      include Enumerable

      # filename - path of the fixture file.
      # sections - Array of heading names (without the leading "#"); the
      #            first one marks the start of a record.
      def initialize(filename, sections)
        # Only the path is kept; the file is opened (and closed) per
        # iteration so no handle leaks and #each can be called repeatedly.
        @filename = filename
        @sections = sections
      end

      # Yields one Array per record, holding the section texts in the order
      # given by `sections` (nil for a section the record does not contain).
      # Nothing is yielded when the file contains no recognised heading.
      def each
        record = {}
        key = nil
        File.open(@filename) do |file|
          file.each_line do |line|
            # chomp rather than slicing off the last character, so a heading
            # on a final line without a newline (or with CRLF) is recognised.
            heading = line.chomp[1..-1] if line[0] == ?#
            if heading && @sections.include?(heading)
              if !record.empty? && heading == @sections[0]
                record[key].chomp! # drop the blank line separating two records
                yield normaliseOutput(record)
                record = {}
              end
              key = heading
              record[key] = ''.dup
            elsif key
              record[key] << line
            end
          end
        end
        yield normaliseOutput(record) unless record.empty?
      end

      # Strips one trailing newline from every section text (in place) and
      # returns the texts ordered like `sections`.
      def normaliseOutput(data)
        data.each_value { |text| text.chomp! }
        @sections.map { |heading| data[heading] }
      end
    end
  end
end

View file

@ -1,8 +1,10 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/inputstream'
require 'html5/inputstream'
class Html5EncodingTestCase < Test::Unit::TestCase
include HTML5
include TestSupport
begin
require 'rubygems'
@ -10,23 +12,21 @@ class Html5EncodingTestCase < Test::Unit::TestCase
def test_chardet
file = File.open(File.join(TESTDATA_DIR, 'encoding', 'chardet', 'test_big5.txt'), 'r')
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true)
stream = HTML5::HTMLInputStream.new(file, :chardet => true)
assert_equal 'big5', stream.char_encoding.downcase
rescue LoadError
puts "chardet not found, skipping chardet tests"
end
end
html5lib_test_files('encoding').each do |test_file|
html5_test_files('encoding').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
input, encoding = data.split(/\n#encoding\s+/, 2)
encoding = encoding.split[0]
TestData.new(test_file, %w(data encoding)).
each_with_index do |(input, encoding), index|
define_method 'test_%s_%d' % [ test_name, index + 1 ] do
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false)
stream = HTML5::HTMLInputStream.new(input, :chardet => false)
assert_equal encoding.downcase, stream.char_encoding.downcase, input
end
end

View file

@ -1,23 +1,23 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/liberalxmlparser'
require 'html5/liberalxmlparser'
XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/
SORTATTRS = '<#{$1+$2.split.sort.join(' ')+$3}>'
def assert_xml_equal(input, expected=nil, parser=HTML5lib::XMLParser)
def assert_xml_equal(input, expected=nil, parser=HTML5::XMLParser)
sortattrs = proc {"<#{$1+$2.split.sort.join(' ')+$3}>"}
document = parser.parse(input.chomp).root
if not expected
expected = input.chomp.gsub(XMLELEM,SORTATTRS)
expected = input.chomp.gsub(XMLELEM,&sortattrs)
expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')}
output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,SORTATTRS)
output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,&sortattrs)
assert_equal(expected, output)
else
assert_equal(expected, document.to_s.gsub(/'/,'"'))
end
end
def assert_xhtml_equal(input, expected=nil, parser=HTML5lib::XHTMLParser)
def assert_xhtml_equal(input, expected=nil, parser=HTML5::XHTMLParser)
assert_xml_equal(input, expected, parser)
end
@ -34,10 +34,10 @@ class BasicXhtml5Test < Test::Unit::TestCase
def test_title_body_named_charref
assert_xhtml_equal(
'<title>mdash</title>A &mdash B',
'<title>ntilde</title>A &ntilde B',
'<html xmlns="http://www.w3.org/1999/xhtml">' +
'<head><title>mdash</title></head>' +
'<body>A '+ [0x2014].pack('U') + ' B</body>' +
'<head><title>ntilde</title></head>' +
'<body>A '+ [0xF1].pack('U') + ' B</body>' +
'</html>')
end
end
@ -193,20 +193,71 @@ EOX
def test_br
assert_xhtml_equal <<EOX1
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>XLINK</title></head>
<head><title>BR</title></head>
<body>
<br/>
</body></html>
EOX1
end
def xtest_strong
def test_strong
assert_xhtml_equal <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>XLINK</title></head>
<head><title>STRONG</title></head>
<body>
<strong></strong>
</body></html>
EOX
end
# A <script> element whose text holds escaped "<" and "&" is handed to
# assert_xhtml_equal with no explicit expected value, i.e. the document is
# checked against itself.
def test_script
  markup = <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>SCRIPT</title></head>
<body>
<script>1 &lt; 2 &amp; 3</script>
</body></html>
EOX
  assert_xhtml_equal(markup)
end
# The input contains a self-closing <script src="..."/> in the head; the
# expected output spells it as an explicit <script ...></script> pair.
def test_script_src
  source = <<EOX1
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>SCRIPT</title><script src="http://example.com"/></head>
<body>
<script>1 &lt; 2 &amp; 3</script>
</body></html>
EOX1
  expected = <<EOX2.strip
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>SCRIPT</title><script src="http://example.com"></script></head>
<body>
<script>1 &lt; 2 &amp; 3</script>
</body></html>
EOX2
  assert_xhtml_equal(source, expected)
end
# A <title> whose text holds escaped "<" and "&" is handed to
# assert_xhtml_equal with no explicit expected value, i.e. the document is
# checked against itself.
def test_title
  markup = <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>1 &lt; 2 &amp; 3</title></head>
<body>
</body></html>
EOX
  assert_xhtml_equal(markup)
end
# The input starts with an XML declaration; the expected output is the same
# document without that declaration.
def test_prolog
  source = <<EOX1
<?xml version="1.0" encoding="UTF-8" ?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>PROLOG</title></head>
<body>
</body></html>
EOX1
  expected = <<EOX2.strip
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>PROLOG</title></head>
<body>
</body></html>
EOX2
  assert_xhtml_equal(source, expected)
end
end

View file

@ -1,7 +1,7 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/treebuilders'
require 'html5lib/html5parser'
require 'html5/treebuilders'
require 'html5/html5parser'
$tree_types_to_test = ['simpletree', 'rexml']
@ -18,18 +18,17 @@ puts 'Testing tree builders: ' + $tree_types_to_test * ', '
class Html5ParserTestCase < Test::Unit::TestCase
include HTML5lib
include HTML5
include TestSupport
html5lib_test_files('tree-construction').each do |test_file|
html5_test_files('tree-construction').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '')
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
innerHTML, input, expected_output, expected_errors =
TestSupport.parseTestcase(data)
TestData.new(test_file, %w(data errors document-fragment document)).
each_with_index do |(input, errors, innerHTML, expected), index|
expected = expected.gsub("\n| ","\n")[2..-1]
$tree_types_to_test.each do |tree_name|
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
@ -44,9 +43,9 @@ class Html5ParserTestCase < Test::Unit::TestCase
actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
assert_equal sortattrs(expected_output), sortattrs(actual_output), [
assert_equal sortattrs(expected), sortattrs(actual_output), [
'', 'Input:', input,
'', 'Expected:', expected_output,
'', 'Expected:', expected,
'', 'Recieved:', actual_output
].join("\n")
@ -54,9 +53,9 @@ class Html5ParserTestCase < Test::Unit::TestCase
actual_errors = parser.errors.map do |(line, col), message|
'Line: %i Col: %i %s' % [line, col, message]
end
assert_equal expected_errors.length, parser.errors.length, [
assert_equal errors.length, parser.errors.length, [
'Input', input + "\n",
'Expected errors:', expected_errors.join("\n"),
'Expected errors:', errors.join("\n"),
'Actual errors:', actual_errors.join("\n")
].join("\n")
end

View file

@ -2,14 +2,14 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser'
require 'html5lib/liberalxmlparser'
require 'html5lib/treewalkers'
require 'html5lib/serializer'
require 'html5lib/sanitizer'
require 'html5/html5parser'
require 'html5/liberalxmlparser'
require 'html5/treewalkers'
require 'html5/serializer'
require 'html5/sanitizer'
class SanitizeTest < Test::Unit::TestCase
include HTML5lib
include HTML5
def sanitize_xhtml stream
XHTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).to_s
@ -131,7 +131,7 @@ class SanitizeTest < Test::Unit::TestCase
# check_sanitization(input, output, output, output)
# end
html5lib_test_files('sanitizer').each do |filename|
html5_test_files('sanitizer').each do |filename|
JSON::parse(open(filename).read).each do |test|
define_method "test_#{test['name']}" do
check_sanitization(

View file

@ -1,13 +1,13 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser'
require 'html5lib/serializer'
require 'html5lib/treewalkers'
require 'html5/html5parser'
require 'html5/serializer'
require 'html5/treewalkers'
#Run the serialize error checks
checkSerializeErrors = false
class JsonWalker < HTML5lib::TreeWalkers::Base
class JsonWalker < HTML5::TreeWalkers::Base
def each
@tree.each do |token|
case token[0]
@ -31,7 +31,7 @@ class JsonWalker < HTML5lib::TreeWalkers::Base
end
class Html5SerializeTestcase < Test::Unit::TestCase
html5lib_test_files('serializer').each do |filename|
html5_test_files('serializer').each do |filename|
test_name = File.basename(filename).sub('.test', '')
tests = JSON::parse(open(filename).read)
tests['tests'].each_with_index do |test, index|
@ -41,7 +41,7 @@ class Html5SerializeTestcase < Test::Unit::TestCase
test["options"][:encoding] = test["options"]["encoding"]
end
result = HTML5lib::HTMLSerializer.
result = HTML5::HTMLSerializer.
serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
expected = test["expected"]
if expected.length == 1
@ -52,7 +52,7 @@ class Html5SerializeTestcase < Test::Unit::TestCase
return if test_name == 'optionaltags'
result = HTML5lib::XHTMLSerializer.
result = HTML5::XHTMLSerializer.
serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
expected = test["xhtml"] || test["expected"]
if expected.length == 1

View file

@ -1,9 +1,9 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/inputstream'
require 'html5/inputstream'
class HTMLInputStreamTest < Test::Unit::TestCase
include HTML5lib
include HTML5
def test_char_ascii
stream = HTMLInputStream.new("'", :encoding=>'ascii')

View file

@ -1,6 +1,6 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/tokenizer'
require 'html5/tokenizer'
require 'tokenizer_test_parser'
@ -36,7 +36,7 @@ class Html5TokenizerTestCase < Test::Unit::TestCase
'' ] * "\n"
assert_nothing_raised message do
tokenizer = HTML5lib::HTMLTokenizer.new(data['input'])
tokenizer = HTML5::HTMLTokenizer.new(data['input'])
tokenizer.contentModelFlag = content_model_flag.to_sym
@ -53,7 +53,7 @@ class Html5TokenizerTestCase < Test::Unit::TestCase
end
end
html5lib_test_files('tokenizer').each do |test_file|
html5_test_files('tokenizer').each do |test_file|
test_name = File.basename(test_file).sub('.test', '')
tests = JSON.parse(File.read(test_file))['tests']

View file

@ -1,25 +1,25 @@
require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser'
require 'html5lib/treewalkers'
require 'html5lib/treebuilders'
require 'html5/html5parser'
require 'html5/treewalkers'
require 'html5/treebuilders'
$tree_types_to_test = {
'simpletree' =>
{:builder => HTML5lib::TreeBuilders['simpletree'],
:walker => HTML5lib::TreeWalkers['simpletree']},
{:builder => HTML5::TreeBuilders['simpletree'],
:walker => HTML5::TreeWalkers['simpletree']},
'rexml' =>
{:builder => HTML5lib::TreeBuilders['rexml'],
:walker => HTML5lib::TreeWalkers['rexml']},
{:builder => HTML5::TreeBuilders['rexml'],
:walker => HTML5::TreeWalkers['rexml']},
'hpricot' =>
{:builder => HTML5lib::TreeBuilders['hpricot'],
:walker => HTML5lib::TreeWalkers['hpricot']},
{:builder => HTML5::TreeBuilders['hpricot'],
:walker => HTML5::TreeWalkers['hpricot']},
}
puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', '
class TestTreeWalkers < Test::Unit::TestCase
include HTML5lib::TestSupport
include HTML5::TestSupport
def concatenateCharacterTokens(tokens)
charactersToken = nil
@ -70,22 +70,21 @@ class TestTreeWalkers < Test::Unit::TestCase
return output.join("\n")
end
html5lib_test_files('tree-construction').each do |test_file|
html5_test_files('tree-construction').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '')
next if test_name == 'tests5' # TODO
File.read(test_file).split("#data\n").each_with_index do |data, index|
next if data.empty?
TestData.new(test_file, %w(data errors document-fragment document)).
each_with_index do |(input, errors, innerHTML, expected), index|
innerHTML, input, expected_output, expected_errors =
HTML5lib::TestSupport::parseTestcase(data)
expected = expected.gsub("\n| ","\n")[2..-1]
$tree_types_to_test.each do |tree_name, tree_class|
define_method "test_#{test_name}_#{index}_#{tree_name}" do
parser = HTML5lib::HTMLParser.new(:tree => tree_class[:builder])
parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])
if innerHTML
parser.parseFragment(input, innerHTML)
@ -97,7 +96,7 @@ class TestTreeWalkers < Test::Unit::TestCase
begin
output = sortattrs(convertTokens(tree_class[:walker].new(document)))
expected = sortattrs(expected_output)
expected = sortattrs(expected)
assert_equal expected, output, [
'', 'Input:', input,
'', 'Expected:', expected,

View file

@ -1,63 +1,63 @@
require 'html5lib/constants'
# Drains a tokenizer and flattens every token it yields into a simple
# array/string form, collected in order.
#
# Method names are camelCase on purpose: #parse dispatches on the token's
# :type by building the method name "process<Type>", so renaming any of the
# process* methods would break the dispatch.
class TokenizerTestParser
  # tokenizer - an object whose #each yields token hashes; each hash must
  #             carry a :type key naming one of the process* handlers below.
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Consumes all tokens and returns the flattened output list.
  # The list is rebuilt from scratch on every call.
  def parse
    @outputTokens = []
    @tokenizer.each do |token|
      # e.g. :StartTag is handled by processStartTag
      send("process#{token[:type]}", token)
    end
    @outputTokens
  end

  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:publicId],
                        token[:systemId], token[:correct]])
  end

  def processStartTag(token)
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An empty tag is reported as a StartTag; when the element is not listed in
  # HTML5lib::VOID_ELEMENTS a "ParseError" entry is emitted first.
  def processEmptyTag(token)
    processParseError(token) unless HTML5lib::VOID_ELEMENTS.include?(token[:name])
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An end tag carrying any data is preceded by a "ParseError" entry; the data
  # itself is not included in the output.
  def processEndTag(token)
    processParseError(token) unless token[:data].empty?
    @outputTokens.push(["EndTag", token[:name]])
  end

  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  # Whitespace-only character tokens are reported exactly like other characters.
  alias processSpaceCharacters processCharacters

  # EOF contributes nothing to the output.
  def processEOF(token)
  end

  # Parse errors are recorded as the bare string "ParseError"; the token's
  # details are dropped.
  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end
require 'html5/constants'
# Drains a tokenizer and flattens every token it yields into a simple
# array/string form, collected in order.
#
# Method names are camelCase on purpose: #parse dispatches on the token's
# :type by building the method name "process<Type>", so renaming any of the
# process* methods would break the dispatch.
class TokenizerTestParser
  # tokenizer - an object whose #each yields token hashes; each hash must
  #             carry a :type key naming one of the process* handlers below.
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Consumes all tokens and returns the flattened output list.
  # The list is rebuilt from scratch on every call.
  def parse
    @outputTokens = []
    @tokenizer.each do |token|
      # e.g. :StartTag is handled by processStartTag
      send("process#{token[:type]}", token)
    end
    @outputTokens
  end

  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:publicId],
                        token[:systemId], token[:correct]])
  end

  def processStartTag(token)
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An empty tag is reported as a StartTag; when the element is not listed in
  # HTML5::VOID_ELEMENTS a "ParseError" entry is emitted first.
  def processEmptyTag(token)
    processParseError(token) unless HTML5::VOID_ELEMENTS.include?(token[:name])
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An end tag carrying any data is preceded by a "ParseError" entry; the data
  # itself is not included in the output.
  def processEndTag(token)
    processParseError(token) unless token[:data].empty?
    @outputTokens.push(["EndTag", token[:name]])
  end

  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  # Whitespace-only character tokens are reported exactly like other characters.
  alias processSpaceCharacters processCharacters

  # EOF contributes nothing to the output.
  def processEOF(token)
  end

  # Parse errors are recorded as the bare string "ParseError"; the token's
  # details are dropped.
  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end

View file

@ -28,6 +28,7 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
class LineSource
include MaRuKu::Strings
attr_reader :parent
def initialize(lines, parent=nil, parent_offset=nil)
raise "NIL lines? " if not lines

View file

@ -65,22 +65,8 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
when :ald
output.push read_ald(src)
when :text
if src.cur_line =~ MightBeTableHeader and
(src.next_line && src.next_line =~ TableSeparator)
output.push read_table(src)
elsif [:header1,:header2].include? src.next_line.md_type
output.push read_header12(src)
elsif eventually_comes_a_def_list(src)
definition = read_definition(src)
if output.last.kind_of?(MDElement) &&
output.last.node_type == :definition_list then
output.last.children << definition
else
output.push md_el(:definition_list, [definition])
end
else # Start of a paragraph
output.push read_paragraph(src)
end
# table, header, definition list, or paragraph
read_text_material(src, output)
when :header2, :hrule
# hrule
src.shift_line
@ -102,7 +88,12 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
when :raw_html; e = read_raw_html(src); output << e if e
when :footnote_text; output.push read_footnote_text(src)
when :ref_definition; read_ref_definition(src, output)
when :ref_definition;
if src.parent && (src.cur_index == 0)
read_text_material(src, output)
else
read_ref_definition(src, output)
end
when :abbreviation; output.push read_abbreviation(src)
when :xml_instr; read_xml_instruction(src, output)
when :metadata;
@ -149,6 +140,24 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
output
end
# Reads the construct that begins at the current text line of +src+ and
# records it in +output+. The candidates are tried in this order:
#
# 1. a table, when the current line matches MightBeTableHeader and the next
#    line exists and matches TableSeparator;
# 2. a header, when the next line's md_type is :header1 or :header2;
# 3. a definition, when eventually_comes_a_def_list(src) is true -- it is
#    appended to a :definition_list element already at the end of +output+,
#    otherwise wrapped in a new :definition_list element;
# 4. a paragraph, in every other case.
#
# src    - the line source being consumed
# output - the list of elements parsed so far; modified in place
def read_text_material(src, output)
  table_ahead = src.cur_line =~ MightBeTableHeader &&
                src.next_line && src.next_line =~ TableSeparator
  return output.push(read_table(src)) if table_ahead

  return output.push(read_header12(src)) if [:header1, :header2].include?(src.next_line.md_type)

  # Anything that is not the start of a definition list is a plain paragraph.
  return output.push(read_paragraph(src)) unless eventually_comes_a_def_list(src)

  definition = read_definition(src)
  previous = output.last
  if previous.kind_of?(MDElement) && previous.node_type == :definition_list
    # Consecutive definitions share one list element.
    previous.children << definition
  else
    output.push(md_el(:definition_list, [definition]))
  end
end
def read_ald(src)
@ -274,9 +283,9 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
item_type = src.cur_line.md_type
first = src.shift_line
# Ugly things going on inside `read_indented_content`
indentation = spaces_before_first_char(first)
break_list = [:ulist, :olist, :ial]
# Ugly things going on inside `read_indented_content`
lines, want_my_paragraph =
read_indented_content(src,indentation, break_list, item_type)
@ -285,7 +294,7 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
stripped = first[indentation, first.size-1]
lines.unshift stripped
#dbg_describe_ary(lines, 'LIST ITEM ')
# dbg_describe_ary(lines, 'LIST ITEM ')
src2 = LineSource.new(lines, src, parent_offset)
children = parse_blocks(src2)