Sync with latest HTML5lib and latest Maruku

This commit is contained in:
Jacques Distler 2007-07-04 17:36:59 -05:00
parent 8e92e4a3ab
commit 8ccaad85a5
71 changed files with 1974 additions and 1621 deletions

View file

@ -25,14 +25,14 @@
module Sanitize module Sanitize
require 'html5lib/html5parser' require 'html5/html5parser'
require 'html5lib/liberalxmlparser' require 'html5/liberalxmlparser'
require 'html5lib/treewalkers' require 'html5/treewalkers'
require 'html5lib/treebuilders' require 'html5/treebuilders'
require 'html5lib/serializer' require 'html5/serializer'
require 'html5lib/sanitizer' require 'html5/sanitizer'
include HTML5lib include HTML5
# Sanitize a string, parsed using XHTML parsing rules. # Sanitize a string, parsed using XHTML parsing rules.
# #

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser' require 'html5/html5parser'
module HTML5lib module HTML5
def self.parse(stream, options={}) def self.parse(stream, options={})
HTMLParser.parse(stream, options) HTMLParser.parse(stream, options)
end end

817
vendor/plugins/HTML5lib/lib/html5/constants.rb vendored Executable file
View file

@ -0,0 +1,817 @@
module HTML5
# Marker exception class with no behavior of its own.
# NOTE(review): presumably raised to signal end of input — confirm against the raise sites.
# It derives directly from Exception (not StandardError), so a bare `rescue`
# clause will not catch it; handlers must name EOF (or Exception) explicitly.
class EOF < Exception; end
# Symbols naming the content-model flags, kept in their original order.
CONTENT_MODEL_FLAGS = [:PCDATA, :RCDATA, :CDATA, :PLAINTEXT]
# Tag names classed as scoping elements.
SCOPING_ELEMENTS = %w[button caption html marquee object table td th]
# Tag names classed as formatting elements.
FORMATTING_ELEMENTS = %w[a b big em font i nobr s small strike strong tt u]
# Tag names classed as special elements (listed alphabetically).
SPECIAL_ELEMENTS = %w[
  address area base basefont bgsound blockquote body br center col colgroup
  dd dir div dl dt embed fieldset form frame frameset
  h1 h2 h3 h4 h5 h6 head hr iframe image img input isindex
  li link listing menu meta noembed noframes noscript
  ol optgroup option p param plaintext pre
  script select spacer style tbody textarea tfoot thead title tr ul wbr
]
# Single-character strings: tab, line feed, vertical tab (0x0B),
# form feed (0x0C), space (0x20) and carriage return.
SPACE_CHARACTERS = ["\t", "\n", "\x0B", "\x0C", "\x20", "\r"]
# Tag names listed for table insert mode.
TABLE_INSERT_MODE_ELEMENTS = %w[table tbody tfoot thead tr]
# The ASCII alphabets as plain strings, plus their concatenation (lowercase first).
ASCII_LOWERCASE = [*('a'..'z')].join
ASCII_UPPERCASE = [*('A'..'Z')].join
ASCII_LETTERS = "#{ASCII_LOWERCASE}#{ASCII_UPPERCASE}"
# DIGITS is a Range of one-character strings; HEX_DIGITS is an Array holding
# the decimal digits followed by a-f and then A-F.
DIGITS = ('0'..'9')
HEX_DIGITS = [*DIGITS, *('a'..'f'), *('A'..'F')]
# Heading elements need to be ordered: h1 first, h6 last.
HEADING_ELEMENTS = (1..6).map { |level| "h#{level}" }
# Tag names classed as void elements.
# XXX What about event-source and command?
VOID_ELEMENTS = %w[base link meta hr br img embed param area col input]
# Tag names grouped under the CDATA and RCDATA constant names respectively.
CDATA_ELEMENTS = ['title', 'textarea']
RCDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript]
# Boolean attribute names keyed by element name (String keys); the Symbol key
# :global holds the list that is not tied to a single element name.
BOOLEAN_ATTRIBUTES = {
  :global    => ['irrelevant'],
  'style'    => ['scoped'],
  'img'      => ['ismap'],
  'audio'    => ['autoplay', 'controls'],
  'video'    => ['autoplay', 'controls'],
  'script'   => ['defer', 'async'],
  'details'  => ['open'],
  'datagrid' => ['multiple', 'disabled'],
  'command'  => ['hidden', 'disabled', 'checked', 'default'],
  'menu'     => ['autosubmit'],
  'fieldset' => ['disabled', 'readonly'],
  'option'   => ['disabled', 'readonly', 'selected'],
  'optgroup' => ['disabled', 'readonly'],
  'button'   => ['disabled', 'autofocus'],
  'input'    => ['disabled', 'readonly', 'required', 'autofocus', 'checked', 'ismap'],
  'select'   => ['disabled', 'readonly', 'autofocus', 'multiple'],
  'output'   => ['disabled', 'readonly']
}
# ENTITIES_WINDOWS1252 has to be _ordered_ and needs to have an index.
# Going by the per-entry notes below, position i holds the Unicode code point
# for Windows-1252 byte 0x80 + i (position 0 is 0x80, position 31 is 0x9F).
# Bytes marked UNDEFINED map to 65533 (U+FFFD).
ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN
65533, # 0x81 UNDEFINED
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
8224, # 0x86 0x2020 DAGGER
8225, # 0x87 0x2021 DOUBLE DAGGER
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
8240, # 0x89 0x2030 PER MILLE SIGN
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
65533, # 0x8D UNDEFINED
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
65533, # 0x8F UNDEFINED
65533, # 0x90 UNDEFINED
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
8226, # 0x95 0x2022 BULLET
8211, # 0x96 0x2013 EN DASH
8212, # 0x97 0x2014 EM DASH
732, # 0x98 0x02DC SMALL TILDE
8482, # 0x99 0x2122 TRADE MARK SIGN
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
65533, # 0x9D UNDEFINED
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
]
# ENTITIES was generated from Python using the following code:
#
# import constants
# entities = constants.entities.items()
# entities.sort()
# list = [ ' '.join([repr(entity), '=>', ord(value)<128 and
# repr(str(value)) or repr(value.encode('utf-8')).replace("'",'"')])
# for entity, value in entities]
# print ' ENTITIES = {\n ' + ',\n '.join(list) + '\n }'
ENTITIES = {
'AElig' => "\xc3\x86",
'AElig;' => "\xc3\x86",
'AMP' => '&',
'AMP;' => '&',
'Aacute' => "\xc3\x81",
'Aacute;' => "\xc3\x81",
'Acirc' => "\xc3\x82",
'Acirc;' => "\xc3\x82",
'Agrave' => "\xc3\x80",
'Agrave;' => "\xc3\x80",
'Alpha;' => "\xce\x91",
'Aring' => "\xc3\x85",
'Aring;' => "\xc3\x85",
'Atilde' => "\xc3\x83",
'Atilde;' => "\xc3\x83",
'Auml' => "\xc3\x84",
'Auml;' => "\xc3\x84",
'Beta;' => "\xce\x92",
'COPY' => "\xc2\xa9",
'COPY;' => "\xc2\xa9",
'Ccedil' => "\xc3\x87",
'Ccedil;' => "\xc3\x87",
'Chi;' => "\xce\xa7",
'Dagger;' => "\xe2\x80\xa1",
'Delta;' => "\xce\x94",
'ETH' => "\xc3\x90",
'ETH;' => "\xc3\x90",
'Eacute' => "\xc3\x89",
'Eacute;' => "\xc3\x89",
'Ecirc' => "\xc3\x8a",
'Ecirc;' => "\xc3\x8a",
'Egrave' => "\xc3\x88",
'Egrave;' => "\xc3\x88",
'Epsilon;' => "\xce\x95",
'Eta;' => "\xce\x97",
'Euml' => "\xc3\x8b",
'Euml;' => "\xc3\x8b",
'GT' => '>',
'GT;' => '>',
'Gamma;' => "\xce\x93",
'Iacute' => "\xc3\x8d",
'Iacute;' => "\xc3\x8d",
'Icirc' => "\xc3\x8e",
'Icirc;' => "\xc3\x8e",
'Igrave' => "\xc3\x8c",
'Igrave;' => "\xc3\x8c",
'Iota;' => "\xce\x99",
'Iuml' => "\xc3\x8f",
'Iuml;' => "\xc3\x8f",
'Kappa;' => "\xce\x9a",
'LT' => '<',
'LT;' => '<',
'Lambda;' => "\xce\x9b",
'Mu;' => "\xce\x9c",
'Ntilde' => "\xc3\x91",
'Ntilde;' => "\xc3\x91",
'Nu;' => "\xce\x9d",
'OElig;' => "\xc5\x92",
'Oacute' => "\xc3\x93",
'Oacute;' => "\xc3\x93",
'Ocirc' => "\xc3\x94",
'Ocirc;' => "\xc3\x94",
'Ograve' => "\xc3\x92",
'Ograve;' => "\xc3\x92",
'Omega;' => "\xce\xa9",
'Omicron;' => "\xce\x9f",
'Oslash' => "\xc3\x98",
'Oslash;' => "\xc3\x98",
'Otilde' => "\xc3\x95",
'Otilde;' => "\xc3\x95",
'Ouml' => "\xc3\x96",
'Ouml;' => "\xc3\x96",
'Phi;' => "\xce\xa6",
'Pi;' => "\xce\xa0",
'Prime;' => "\xe2\x80\xb3",
'Psi;' => "\xce\xa8",
'QUOT' => '"',
'QUOT;' => '"',
'REG' => "\xc2\xae",
'REG;' => "\xc2\xae",
'Rho;' => "\xce\xa1",
'Scaron;' => "\xc5\xa0",
'Sigma;' => "\xce\xa3",
'THORN' => "\xc3\x9e",
'THORN;' => "\xc3\x9e",
'TRADE;' => "\xe2\x84\xa2",
'Tau;' => "\xce\xa4",
'Theta;' => "\xce\x98",
'Uacute' => "\xc3\x9a",
'Uacute;' => "\xc3\x9a",
'Ucirc' => "\xc3\x9b",
'Ucirc;' => "\xc3\x9b",
'Ugrave' => "\xc3\x99",
'Ugrave;' => "\xc3\x99",
'Upsilon;' => "\xce\xa5",
'Uuml' => "\xc3\x9c",
'Uuml;' => "\xc3\x9c",
'Xi;' => "\xce\x9e",
'Yacute' => "\xc3\x9d",
'Yacute;' => "\xc3\x9d",
'Yuml;' => "\xc5\xb8",
'Zeta;' => "\xce\x96",
'aacute' => "\xc3\xa1",
'aacute;' => "\xc3\xa1",
'acirc' => "\xc3\xa2",
'acirc;' => "\xc3\xa2",
'acute' => "\xc2\xb4",
'acute;' => "\xc2\xb4",
'aelig' => "\xc3\xa6",
'aelig;' => "\xc3\xa6",
'agrave' => "\xc3\xa0",
'agrave;' => "\xc3\xa0",
'alefsym;' => "\xe2\x84\xb5",
'alpha;' => "\xce\xb1",
'amp' => '&',
'amp;' => '&',
'and;' => "\xe2\x88\xa7",
'ang;' => "\xe2\x88\xa0",
'apos;' => "'",
'aring' => "\xc3\xa5",
'aring;' => "\xc3\xa5",
'asymp;' => "\xe2\x89\x88",
'atilde' => "\xc3\xa3",
'atilde;' => "\xc3\xa3",
'auml' => "\xc3\xa4",
'auml;' => "\xc3\xa4",
'bdquo;' => "\xe2\x80\x9e",
'beta;' => "\xce\xb2",
'brvbar' => "\xc2\xa6",
'brvbar;' => "\xc2\xa6",
'bull;' => "\xe2\x80\xa2",
'cap;' => "\xe2\x88\xa9",
'ccedil' => "\xc3\xa7",
'ccedil;' => "\xc3\xa7",
'cedil' => "\xc2\xb8",
'cedil;' => "\xc2\xb8",
'cent' => "\xc2\xa2",
'cent;' => "\xc2\xa2",
'chi;' => "\xcf\x87",
'circ;' => "\xcb\x86",
'clubs;' => "\xe2\x99\xa3",
'cong;' => "\xe2\x89\x85",
'copy' => "\xc2\xa9",
'copy;' => "\xc2\xa9",
'crarr;' => "\xe2\x86\xb5",
'cup;' => "\xe2\x88\xaa",
'curren' => "\xc2\xa4",
'curren;' => "\xc2\xa4",
'dArr;' => "\xe2\x87\x93",
'dagger;' => "\xe2\x80\xa0",
'darr;' => "\xe2\x86\x93",
'deg' => "\xc2\xb0",
'deg;' => "\xc2\xb0",
'delta;' => "\xce\xb4",
'diams;' => "\xe2\x99\xa6",
'divide' => "\xc3\xb7",
'divide;' => "\xc3\xb7",
'eacute' => "\xc3\xa9",
'eacute;' => "\xc3\xa9",
'ecirc' => "\xc3\xaa",
'ecirc;' => "\xc3\xaa",
'egrave' => "\xc3\xa8",
'egrave;' => "\xc3\xa8",
'empty;' => "\xe2\x88\x85",
'emsp;' => "\xe2\x80\x83",
'ensp;' => "\xe2\x80\x82",
'epsilon;' => "\xce\xb5",
'equiv;' => "\xe2\x89\xa1",
'eta;' => "\xce\xb7",
'eth' => "\xc3\xb0",
'eth;' => "\xc3\xb0",
'euml' => "\xc3\xab",
'euml;' => "\xc3\xab",
'euro;' => "\xe2\x82\xac",
'exist;' => "\xe2\x88\x83",
'fnof;' => "\xc6\x92",
'forall;' => "\xe2\x88\x80",
'frac12' => "\xc2\xbd",
'frac12;' => "\xc2\xbd",
'frac14' => "\xc2\xbc",
'frac14;' => "\xc2\xbc",
'frac34' => "\xc2\xbe",
'frac34;' => "\xc2\xbe",
'frasl;' => "\xe2\x81\x84",
'gamma;' => "\xce\xb3",
'ge;' => "\xe2\x89\xa5",
'gt' => '>',
'gt;' => '>',
'hArr;' => "\xe2\x87\x94",
'harr;' => "\xe2\x86\x94",
'hearts;' => "\xe2\x99\xa5",
'hellip;' => "\xe2\x80\xa6",
'iacute' => "\xc3\xad",
'iacute;' => "\xc3\xad",
'icirc' => "\xc3\xae",
'icirc;' => "\xc3\xae",
'iexcl' => "\xc2\xa1",
'iexcl;' => "\xc2\xa1",
'igrave' => "\xc3\xac",
'igrave;' => "\xc3\xac",
'image;' => "\xe2\x84\x91",
'infin;' => "\xe2\x88\x9e",
'int;' => "\xe2\x88\xab",
'iota;' => "\xce\xb9",
'iquest' => "\xc2\xbf",
'iquest;' => "\xc2\xbf",
'isin;' => "\xe2\x88\x88",
'iuml' => "\xc3\xaf",
'iuml;' => "\xc3\xaf",
'kappa;' => "\xce\xba",
'lArr;' => "\xe2\x87\x90",
'lambda;' => "\xce\xbb",
'lang;' => "\xe3\x80\x88",
'laquo' => "\xc2\xab",
'laquo;' => "\xc2\xab",
'larr;' => "\xe2\x86\x90",
'lceil;' => "\xe2\x8c\x88",
'ldquo;' => "\xe2\x80\x9c",
'le;' => "\xe2\x89\xa4",
'lfloor;' => "\xe2\x8c\x8a",
'lowast;' => "\xe2\x88\x97",
'loz;' => "\xe2\x97\x8a",
'lrm;' => "\xe2\x80\x8e",
'lsaquo;' => "\xe2\x80\xb9",
'lsquo;' => "\xe2\x80\x98",
'lt' => '<',
'lt;' => '<',
'macr' => "\xc2\xaf",
'macr;' => "\xc2\xaf",
'mdash;' => "\xe2\x80\x94",
'micro' => "\xc2\xb5",
'micro;' => "\xc2\xb5",
'middot' => "\xc2\xb7",
'middot;' => "\xc2\xb7",
'minus;' => "\xe2\x88\x92",
'mu;' => "\xce\xbc",
'nabla;' => "\xe2\x88\x87",
'nbsp' => "\xc2\xa0",
'nbsp;' => "\xc2\xa0",
'ndash;' => "\xe2\x80\x93",
'ne;' => "\xe2\x89\xa0",
'ni;' => "\xe2\x88\x8b",
'not' => "\xc2\xac",
'not;' => "\xc2\xac",
'notin;' => "\xe2\x88\x89",
'nsub;' => "\xe2\x8a\x84",
'ntilde' => "\xc3\xb1",
'ntilde;' => "\xc3\xb1",
'nu;' => "\xce\xbd",
'oacute' => "\xc3\xb3",
'oacute;' => "\xc3\xb3",
'ocirc' => "\xc3\xb4",
'ocirc;' => "\xc3\xb4",
'oelig;' => "\xc5\x93",
'ograve' => "\xc3\xb2",
'ograve;' => "\xc3\xb2",
'oline;' => "\xe2\x80\xbe",
'omega;' => "\xcf\x89",
'omicron;' => "\xce\xbf",
'oplus;' => "\xe2\x8a\x95",
'or;' => "\xe2\x88\xa8",
'ordf' => "\xc2\xaa",
'ordf;' => "\xc2\xaa",
'ordm' => "\xc2\xba",
'ordm;' => "\xc2\xba",
'oslash' => "\xc3\xb8",
'oslash;' => "\xc3\xb8",
'otilde' => "\xc3\xb5",
'otilde;' => "\xc3\xb5",
'otimes;' => "\xe2\x8a\x97",
'ouml' => "\xc3\xb6",
'ouml;' => "\xc3\xb6",
'para' => "\xc2\xb6",
'para;' => "\xc2\xb6",
'part;' => "\xe2\x88\x82",
'permil;' => "\xe2\x80\xb0",
'perp;' => "\xe2\x8a\xa5",
'phi;' => "\xcf\x86",
'pi;' => "\xcf\x80",
'piv;' => "\xcf\x96",
'plusmn' => "\xc2\xb1",
'plusmn;' => "\xc2\xb1",
'pound' => "\xc2\xa3",
'pound;' => "\xc2\xa3",
'prime;' => "\xe2\x80\xb2",
'prod;' => "\xe2\x88\x8f",
'prop;' => "\xe2\x88\x9d",
'psi;' => "\xcf\x88",
'quot' => '"',
'quot;' => '"',
'rArr;' => "\xe2\x87\x92",
'radic;' => "\xe2\x88\x9a",
'rang;' => "\xe3\x80\x89",
'raquo' => "\xc2\xbb",
'raquo;' => "\xc2\xbb",
'rarr;' => "\xe2\x86\x92",
'rceil;' => "\xe2\x8c\x89",
'rdquo;' => "\xe2\x80\x9d",
'real;' => "\xe2\x84\x9c",
'reg' => "\xc2\xae",
'reg;' => "\xc2\xae",
'rfloor;' => "\xe2\x8c\x8b",
'rho;' => "\xcf\x81",
'rlm;' => "\xe2\x80\x8f",
'rsaquo;' => "\xe2\x80\xba",
'rsquo;' => "\xe2\x80\x99",
'sbquo;' => "\xe2\x80\x9a",
'scaron;' => "\xc5\xa1",
'sdot;' => "\xe2\x8b\x85",
'sect' => "\xc2\xa7",
'sect;' => "\xc2\xa7",
'shy' => "\xc2\xad",
'shy;' => "\xc2\xad",
'sigma;' => "\xcf\x83",
'sigmaf;' => "\xcf\x82",
'sim;' => "\xe2\x88\xbc",
'spades;' => "\xe2\x99\xa0",
'sub;' => "\xe2\x8a\x82",
'sube;' => "\xe2\x8a\x86",
'sum;' => "\xe2\x88\x91",
'sup1' => "\xc2\xb9",
'sup1;' => "\xc2\xb9",
'sup2' => "\xc2\xb2",
'sup2;' => "\xc2\xb2",
'sup3' => "\xc2\xb3",
'sup3;' => "\xc2\xb3",
'sup;' => "\xe2\x8a\x83",
'supe;' => "\xe2\x8a\x87",
'szlig' => "\xc3\x9f",
'szlig;' => "\xc3\x9f",
'tau;' => "\xcf\x84",
'there4;' => "\xe2\x88\xb4",
'theta;' => "\xce\xb8",
'thetasym;' => "\xcf\x91",
'thinsp;' => "\xe2\x80\x89",
'thorn' => "\xc3\xbe",
'thorn;' => "\xc3\xbe",
'tilde;' => "\xcb\x9c",
'times' => "\xc3\x97",
'times;' => "\xc3\x97",
'trade;' => "\xe2\x84\xa2",
'uArr;' => "\xe2\x87\x91",
'uacute' => "\xc3\xba",
'uacute;' => "\xc3\xba",
'uarr;' => "\xe2\x86\x91",
'ucirc' => "\xc3\xbb",
'ucirc;' => "\xc3\xbb",
'ugrave' => "\xc3\xb9",
'ugrave;' => "\xc3\xb9",
'uml' => "\xc2\xa8",
'uml;' => "\xc2\xa8",
'upsih;' => "\xcf\x92",
'upsilon;' => "\xcf\x85",
'uuml' => "\xc3\xbc",
'uuml;' => "\xc3\xbc",
'weierp;' => "\xe2\x84\x98",
'xi;' => "\xce\xbe",
'yacute' => "\xc3\xbd",
'yacute;' => "\xc3\xbd",
'yen' => "\xc2\xa5",
'yen;' => "\xc2\xa5",
'yuml' => "\xc3\xbf",
'yuml;' => "\xc3\xbf",
'zeta;' => "\xce\xb6",
'zwj;' => "\xe2\x80\x8d",
'zwnj;' => "\xe2\x80\x8c"
}
ENCODINGS = %w[
ansi_x3.4-1968
iso-ir-6
ansi_x3.4-1986
iso_646.irv:1991
ascii
iso646-us
us-ascii
us
ibm367
cp367
csascii
ks_c_5601-1987
korean
iso-2022-kr
csiso2022kr
euc-kr
iso-2022-jp
csiso2022jp
iso-2022-jp-2
iso-ir-58
chinese
csiso58gb231280
iso_8859-1:1987
iso-ir-100
iso_8859-1
iso-8859-1
latin1
l1
ibm819
cp819
csisolatin1
iso_8859-2:1987
iso-ir-101
iso_8859-2
iso-8859-2
latin2
l2
csisolatin2
iso_8859-3:1988
iso-ir-109
iso_8859-3
iso-8859-3
latin3
l3
csisolatin3
iso_8859-4:1988
iso-ir-110
iso_8859-4
iso-8859-4
latin4
l4
csisolatin4
iso_8859-6:1987
iso-ir-127
iso_8859-6
iso-8859-6
ecma-114
asmo-708
arabic
csisolatinarabic
iso_8859-7:1987
iso-ir-126
iso_8859-7
iso-8859-7
elot_928
ecma-118
greek
greek8
csisolatingreek
iso_8859-8:1988
iso-ir-138
iso_8859-8
iso-8859-8
hebrew
csisolatinhebrew
iso_8859-5:1988
iso-ir-144
iso_8859-5
iso-8859-5
cyrillic
csisolatincyrillic
iso_8859-9:1989
iso-ir-148
iso_8859-9
iso-8859-9
latin5
l5
csisolatin5
iso-8859-10
iso-ir-157
l6
iso_8859-10:1992
csisolatin6
latin6
hp-roman8
roman8
r8
ibm037
cp037
csibm037
ibm424
cp424
csibm424
ibm437
cp437
437
cspc8codepage437
ibm500
cp500
csibm500
ibm775
cp775
cspc775baltic
ibm850
cp850
850
cspc850multilingual
ibm852
cp852
852
cspcp852
ibm855
cp855
855
csibm855
ibm857
cp857
857
csibm857
ibm860
cp860
860
csibm860
ibm861
cp861
861
cp-is
csibm861
ibm862
cp862
862
cspc862latinhebrew
ibm863
cp863
863
csibm863
ibm864
cp864
csibm864
ibm865
cp865
865
csibm865
ibm866
cp866
866
csibm866
ibm869
cp869
869
cp-gr
csibm869
ibm1026
cp1026
csibm1026
koi8-r
cskoi8r
koi8-u
big5-hkscs
ptcp154
csptcp154
pt154
cp154
utf-7
utf-16be
utf-16le
utf-16
utf-8
iso-8859-13
iso-8859-14
iso-ir-199
iso_8859-14:1998
iso_8859-14
latin8
iso-celtic
l8
iso-8859-15
iso_8859-15
iso-8859-16
iso-ir-226
iso_8859-16:2001
iso_8859-16
latin10
l10
gbk
cp936
ms936
gb18030
shift_jis
ms_kanji
csshiftjis
euc-jp
gb2312
big5
csbig5
windows-1250
windows-1251
windows-1252
windows-1253
windows-1254
windows-1255
windows-1256
windows-1257
windows-1258
tis-620
hz-gb-2312
]
end

View file

@ -0,0 +1 @@
require 'html5/filters/optionaltags'

View file

@ -1,7 +1,7 @@
require 'delegate' require 'delegate'
require 'enumerator' require 'enumerator'
module HTML5lib module HTML5
module Filters module Filters
class Base < SimpleDelegator class Base < SimpleDelegator
include Enumerable include Enumerable

View file

@ -1,6 +1,6 @@
require 'html5lib/filters/base' require 'html5/filters/base'
module HTML5lib module HTML5
module Filters module Filters
class InjectMetaCharset < Base class InjectMetaCharset < Base
def initialize(source, encoding) def initialize(source, encoding)

View file

@ -1,7 +1,7 @@
require 'html5lib/constants' require 'html5/constants'
require 'html5lib/filters/base' require 'html5/filters/base'
module HTML5lib module HTML5
module Filters module Filters
class OptionalTagFilter < Base class OptionalTagFilter < Base

View file

@ -1,7 +1,7 @@
require 'html5lib/filters/base' require 'html5/filters/base'
require 'html5lib/sanitizer' require 'html5/sanitizer'
module HTML5lib module HTML5
module Filters module Filters
class HTMLSanitizeFilter < Base class HTMLSanitizeFilter < Base
include HTMLSanitizeModule include HTMLSanitizeModule

View file

@ -1,7 +1,7 @@
require 'html5lib/constants' require 'html5/constants'
require 'html5lib/filters/base' require 'html5/filters/base'
module HTML5lib module HTML5
module Filters module Filters
class WhitespaceFilter < Base class WhitespaceFilter < Base

View file

@ -1,12 +1,12 @@
require 'html5lib/constants' require 'html5/constants'
require 'html5lib/tokenizer' require 'html5/tokenizer'
require 'html5lib/treebuilders/rexml' require 'html5/treebuilders/rexml'
Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path| Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
require 'html5lib/html5parser/' + File.basename(path) require 'html5/html5parser/' + File.basename(path)
end end
module HTML5lib module HTML5
# Error in parsed document # Error in parsed document
class ParseError < Exception; end class ParseError < Exception; end
@ -37,7 +37,7 @@ module HTML5lib
# :strict - raise an exception when a parse error is encountered # :strict - raise an exception when a parse error is encountered
# :tree - a treebuilder class controlling the type of tree that will be # :tree - a treebuilder class controlling the type of tree that will be
# returned. Built in treebuilders can be accessed through # returned. Built in treebuilders can be accessed through
# HTML5lib::TreeBuilders[treeType] # HTML5::TreeBuilders[treeType]
def initialize(options = {}) def initialize(options = {})
@strict = false @strict = false
@errors = [] @errors = []
@ -51,7 +51,7 @@ module HTML5lib
@phases = @@phases.inject({}) do |phases, phase_name| @phases = @@phases.inject({}) do |phases, phase_name|
phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase' phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
phases[phase_name.to_sym] = HTML5lib.const_get(phase_class_name).new(self, @tree) phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
phases phases
end end
end end

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class AfterBodyPhase < Phase class AfterBodyPhase < Phase
handle_end 'html' handle_end 'html'

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class AfterFramesetPhase < Phase class AfterFramesetPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#after3 # http://www.whatwg.org/specs/web-apps/current-work/#after3

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class AfterHeadPhase < Phase class AfterHeadPhase < Phase
handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead' handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'

View file

@ -1,11 +1,11 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class BeforeHeadPhase < Phase class BeforeHeadPhase < Phase
handle_start 'html', 'head' handle_start 'html', 'head'
handle_end %w( html head body br ) => 'ImplyHead' handle_end %w( html head body br p ) => 'ImplyHead'
def processEOF def processEOF
startTagHead('head', {}) startTagHead('head', {})

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InBodyPhase < Phase class InBodyPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-body # http://www.whatwg.org/specs/web-apps/current-work/#in-body
@ -112,7 +112,7 @@ module HTML5lib
def startTagForm(name, attributes) def startTagForm(name, attributes)
if @tree.formPointer if @tree.formPointer
@parser.parseError('Unexpected start tag (form). Ignored.') @parser.parseError(_('Unexpected start tag (form). Ignored.'))
else else
endTagP('p') if in_scope?('p') endTagP('p') if in_scope?('p')
@tree.insertElement(name, attributes) @tree.insertElement(name, attributes)
@ -129,9 +129,9 @@ module HTML5lib
if stopName.include?(node.name) if stopName.include?(node.name)
poppedNodes = (0..i).collect { @tree.openElements.pop } poppedNodes = (0..i).collect { @tree.openElements.pop }
if i >= 1 if i >= 1
@parser.parseError("Missing end tag%s (%s)" % [ @parser.parseError(_("Missing end tag%s (%s)" % [
(i>1 ? 's' : ''), (i>1 ? 's' : ''),
poppedNodes.reverse.map {|item| item.name}.join(', ')]) poppedNodes.reverse.map {|item| item.name}.join(', ')]))
end end
break break
end end
@ -251,7 +251,7 @@ module HTML5lib
end end
def startTagIsindex(name, attributes) def startTagIsindex(name, attributes)
@parser.parseError("Unexpected start tag isindex. Don't use it!") @parser.parseError(_("Unexpected start tag isindex. Don't use it!"))
return if @tree.formPointer return if @tree.formPointer
processStartTag('form', {}) processStartTag('form', {})
processStartTag('hr', {}) processStartTag('hr', {})
@ -311,8 +311,13 @@ module HTML5lib
def endTagP(name) def endTagP(name)
@tree.generateImpliedEndTags('p') if in_scope?('p') @tree.generateImpliedEndTags('p') if in_scope?('p')
@parser.parseError('Unexpected end tag (p).') unless @tree.openElements[-1].name == 'p' @parser.parseError(_('Unexpected end tag (p).')) unless @tree.openElements[-1].name == 'p'
if in_scope?('p')
@tree.openElements.pop while in_scope?('p') @tree.openElements.pop while in_scope?('p')
else
startTagCloseP('p', {})
endTagP('p')
end
end end
def endTagBody(name) def endTagBody(name)
@ -342,7 +347,7 @@ module HTML5lib
@tree.generateImpliedEndTags if in_scope?(name) @tree.generateImpliedEndTags if in_scope?(name)
unless @tree.openElements[-1].name == name unless @tree.openElements[-1].name == name
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag.")) @parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
end end
if in_scope?(name) if in_scope?(name)
@ -351,7 +356,14 @@ module HTML5lib
end end
def endTagForm(name) def endTagForm(name)
endTagBlock(name) if in_scope?(name)
@tree.generateImpliedEndTags
end
if @tree.openElements[-1].name != name
@parser.parseError(_("End tag (form) seen too early. Ignored."))
else
@tree.openElements.pop
end
@tree.formPointer = nil @tree.formPointer = nil
end end
@ -361,7 +373,7 @@ module HTML5lib
@tree.generateImpliedEndTags(name) @tree.generateImpliedEndTags(name)
unless @tree.openElements[-1].name == name unless @tree.openElements[-1].name == name
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag.")) @parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
end end
end end
@ -377,7 +389,7 @@ module HTML5lib
end end
unless @tree.openElements[-1].name == name unless @tree.openElements[-1].name == name
@parser.parseError(("Unexpected end tag (#{name}). Expected other end tag.")) @parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag."))
end end
HEADING_ELEMENTS.each do |element| HEADING_ELEMENTS.each do |element|

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InCaptionPhase < Phase class InCaptionPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption # http://www.whatwg.org/specs/web-apps/current-work/#in-caption

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InCellPhase < Phase class InCellPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell # http://www.whatwg.org/specs/web-apps/current-work/#in-cell

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InColumnGroupPhase < Phase class InColumnGroupPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-column # http://www.whatwg.org/specs/web-apps/current-work/#in-column

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InFramesetPhase < Phase class InFramesetPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset

View file

@ -1,12 +1,12 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InHeadPhase < Phase class InHeadPhase < Phase
handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta ) handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta )
handle_end 'head' handle_end 'head'
handle_end %w( html body br ) => 'ImplyAfterHead' handle_end %w( html body br p ) => 'ImplyAfterHead'
handle_end %w( title style script ) handle_end %w( title style script )
def processEOF def processEOF

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InRowPhase < Phase class InRowPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-row # http://www.whatwg.org/specs/web-apps/current-work/#in-row

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InSelectPhase < Phase class InSelectPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-select # http://www.whatwg.org/specs/web-apps/current-work/#in-select

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InTableBodyPhase < Phase class InTableBodyPhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0 # http://www.whatwg.org/specs/web-apps/current-work/#in-table0

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InTablePhase < Phase class InTablePhase < Phase
# http://www.whatwg.org/specs/web-apps/current-work/#in-table # http://www.whatwg.org/specs/web-apps/current-work/#in-table

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class InitialPhase < Phase class InitialPhase < Phase
# This phase deals with error handling as well which is currently not # This phase deals with error handling as well which is currently not

View file

@ -1,4 +1,4 @@
module HTML5lib module HTML5
# Base class for helper objects that implement each phase of processing. # Base class for helper objects that implement each phase of processing.
# #
# Handler methods should be in the following order (they can be omitted): # Handler methods should be in the following order (they can be omitted):

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class RootElementPhase < Phase class RootElementPhase < Phase
def processEOF def processEOF

View file

@ -1,6 +1,6 @@
require 'html5lib/html5parser/phase' require 'html5/html5parser/phase'
module HTML5lib module HTML5
class TrailingEndPhase < Phase class TrailingEndPhase < Phase
def processEOF def processEOF

View file

@ -1,7 +1,7 @@
require 'stringio' require 'stringio'
require 'html5lib/constants' require 'html5/constants'
module HTML5lib module HTML5
# Provides a unicode stream of characters to the HTMLTokenizer. # Provides a unicode stream of characters to the HTMLTokenizer.
@ -10,7 +10,7 @@ module HTML5lib
class HTMLInputStream class HTMLInputStream
attr_accessor :queue, :char_encoding attr_accessor :queue, :char_encoding, :errors
# Initialises the HTMLInputStream. # Initialises the HTMLInputStream.
# #
@ -40,25 +40,31 @@ module HTML5lib
#Number of bytes to use when looking for a meta element with #Number of bytes to use when looking for a meta element with
#encoding information #encoding information
@NUM_BYTES_META = 512 @NUM_BYTES_META = 512
#Number of bytes to read per chunk when detecting the encoding with chardet
@NUM_BYTES_CHARDET = 256
#Number of bytes to use when reading content
@NUM_BYTES_BUFFER = 1024
#Encoding to use if no other information can be found #Encoding to use if no other information can be found
@DEFAULT_ENCODING = 'windows-1252' @DEFAULT_ENCODING = 'windows-1252'
#Detect encoding iff no explicit "transport level" encoding is supplied #Detect encoding iff no explicit "transport level" encoding is supplied
if @encoding.nil? or not HTML5lib.is_valid_encoding(@encoding) if @encoding.nil? or not HTML5.is_valid_encoding(@encoding)
@char_encoding = detect_encoding @char_encoding = detect_encoding
else else
@char_encoding = @encoding @char_encoding = @encoding
end end
# Read bytes from stream decoding them into Unicode # Read bytes from stream decoding them into Unicode
uString = @raw_stream.read @buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
if @char_encoding == 'windows-1252' if @char_encoding == 'windows-1252'
@win1252 = true @win1252 = true
elsif @char_encoding != 'utf-8' elsif @char_encoding != 'utf-8'
begin begin
require 'iconv' require 'iconv'
begin begin
uString = Iconv.iconv('utf-8', @char_encoding, uString).first @buffer << @raw_stream.read unless @raw_stream.eof?
@buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
rescue rescue
@win1252 = true @win1252 = true
end end
@ -67,10 +73,8 @@ module HTML5lib
end end
end end
# Convert the unicode string into a list to be used as the data stream
@data_stream = uString
@queue = [] @queue = []
@errors = []
# Reset position in the list to read from # Reset position in the list to read from
@tell = 0 @tell = 0
@ -109,9 +113,22 @@ module HTML5lib
begin begin
require 'rubygems' require 'rubygems'
require 'UniversalDetector' # gem install chardet require 'UniversalDetector' # gem install chardet
buffer = @raw_stream.read buffers = []
encoding = UniversalDetector::chardet(buffer)['encoding'] detector = UniversalDetector::Detector.instance
seek(buffer, 0) detector.reset
until @raw_stream.eof?
buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
break if !buffer or buffer.empty?
buffers << buffer
detector.feed(buffer)
break if detector.instance_eval {@done}
detector.instance_eval {
@_mLastChar = @_mLastChar.chr if Fixnum === @_mLastChar
}
end
detector.close
encoding = detector.result['encoding']
seek(buffers*'', 0)
rescue LoadError rescue LoadError
end end
end end
@ -242,14 +259,20 @@ module HTML5lib
unless @queue.empty? unless @queue.empty?
return @queue.shift return @queue.shift
else else
c = @data_stream[@tell] if @tell + 3 > @buffer.length and !@raw_stream.eof?
# read next block
@buffer = @buffer[@tell .. -1] + @raw_stream.read(@NUM_BYTES_BUFFER)
@tell = 0
end
c = @buffer[@tell]
@tell += 1 @tell += 1
case c case c
when 0x01 .. 0x7F when 0x01 .. 0x7F
if c == 0x0D if c == 0x0D
# normalize newlines # normalize newlines
@tell += 1 if @data_stream[@tell] == 0x0A @tell += 1 if @buffer[@tell] == 0x0A
c = 0x0A c = 0x0A
end end
@ -276,7 +299,7 @@ module HTML5lib
when 0xC0 .. 0xFF when 0xC0 .. 0xFF
if @win1252 if @win1252
"\xC3" + (c-64).chr # convert to utf-8 "\xC3" + (c-64).chr # convert to utf-8
elsif @data_stream[@tell-1 .. -1] =~ /^ elsif @buffer[@tell-1 .. @tell+3] =~ /^
( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte ( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
@ -292,6 +315,8 @@ module HTML5lib
end end
when 0x00 when 0x00
@errors.push('null character found in input stream, ' +
'replaced with U+FFFD')
[0xFFFD].pack('U') # null characters are invalid [0xFFFD].pack('U') # null characters are invalid
else else
@ -317,6 +342,10 @@ module HTML5lib
@queue.insert(0, c) unless c == :EOF @queue.insert(0, c) unless c == :EOF
return char_stack.join('') return char_stack.join('')
end end
# Pushes characters back onto the front of the read queue, so they are the
# next values handed out by the `@queue.shift` in the character reader.
#
# characters - a single character (String), an Array of characters, or the
#              :EOF marker. A bare :EOF is ignored and nothing is queued.
#
# Returns the queue, or nil when :EOF was passed.
def unget(characters)
  # Kernel#Array wraps a lone String and passes an Array through untouched.
  # The previous `characters.to_a` needed String#to_a, which only exists on
  # Ruby 1.8 and raises NoMethodError on later versions.
  @queue.unshift(*Array(characters)) unless characters == :EOF
end
end end
# String-like object with an associated position and various extra methods # String-like object with an associated position and various extra methods
@ -433,14 +462,14 @@ module HTML5lib
if attr[0] == 'charset' if attr[0] == 'charset'
tentative_encoding = attr[1] tentative_encoding = attr[1]
if HTML5lib.is_valid_encoding(tentative_encoding) if HTML5.is_valid_encoding(tentative_encoding)
@encoding = tentative_encoding @encoding = tentative_encoding
return false return false
end end
elsif attr[0] == 'content' elsif attr[0] == 'content'
content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1])) content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
tentative_encoding = content_parser.parse tentative_encoding = content_parser.parse
if HTML5lib.is_valid_encoding(tentative_encoding) if HTML5.is_valid_encoding(tentative_encoding)
@encoding = tentative_encoding @encoding = tentative_encoding
return false return false
end end

View file

@ -11,10 +11,10 @@
# #
# @@TODO: # @@TODO:
# * Selectively lowercase only XHTML, but not foreign markup # * Selectively lowercase only XHTML, but not foreign markup
require 'html5lib/html5parser' require 'html5/html5parser'
require 'html5lib/constants' require 'html5/constants'
module HTML5lib module HTML5
# liberal XML parser # liberal XML parser
class XMLParser < HTMLParser class XMLParser < HTMLParser
@ -25,25 +25,35 @@ module HTML5lib
end end
def normalizeToken(token) def normalizeToken(token)
if token[:type] == :StartTag or token[:type] == :EmptyTag case token[:type]
when :StartTag, :EmptyTag
# We need to remove the duplicate attributes and convert attributes # We need to remove the duplicate attributes and convert attributes
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"} # to a Hash so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
token[:data] = Hash[*token[:data].reverse.flatten] token[:data] = Hash[*token[:data].reverse.flatten]
# For EmptyTags, process both a Start and an End tag # For EmptyTags, process both a Start and an End tag
if token[:type] == :EmptyTag if token[:type] == :EmptyTag
save = @tokenizer.contentModelFlag
@phase.processStartTag(token[:name], token[:data]) @phase.processStartTag(token[:name], token[:data])
@tokenizer.contentModelFlag = save
token[:data] = {} token[:data] = {}
token[:type] = :EndTag token[:type] = :EndTag
end end
elsif token[:type] == :EndTag when :Characters
# un-escape RCDATA_ELEMENTS (e.g. style, script)
if @tokenizer.contentModelFlag == :CDATA
token[:data] = token[:data].
gsub('&lt;','<').gsub('&gt;','>').gsub('&amp;','&')
end
when :EndTag
if token[:data] if token[:data]
parseError(_("End tag contains unexpected attributes.")) parseError(_("End tag contains unexpected attributes."))
end end
elsif token[:type] == :Comment when :Comment
# Rescue CDATA from the comments # Rescue CDATA from the comments
if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]" if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
token[:type] = :Characters token[:type] = :Characters

View file

@ -1,6 +1,7 @@
require 'cgi' require 'cgi'
require 'html5/tokenizer'
module HTML5lib module HTML5
# This module provides sanitization of XHTML+MathML+SVG # This module provides sanitization of XHTML+MathML+SVG
# and of inline style attributes. # and of inline style attributes.

View file

@ -0,0 +1,2 @@
require 'html5/serializer/htmlserializer'
require 'html5/serializer/xhtmlserializer'

View file

@ -1,6 +1,6 @@
require 'html5lib/constants' require 'html5/constants'
module HTML5lib module HTML5
class HTMLSerializer class HTMLSerializer
@ -21,6 +21,7 @@ module HTML5lib
@use_trailing_solidus = false @use_trailing_solidus = false
@space_before_trailing_solidus = true @space_before_trailing_solidus = true
@escape_lt_in_attrs = false @escape_lt_in_attrs = false
@escape_rcdata = false
@omit_optional_tags = true @omit_optional_tags = true
@sanitize = false @sanitize = false
@ -43,22 +44,22 @@ module HTML5lib
@errors = [] @errors = []
if encoding and @inject_meta_charset if encoding and @inject_meta_charset
require 'html5lib/filters/inject_meta_charset' require 'html5/filters/inject_meta_charset'
treewalker = Filters::InjectMetaCharset.new(treewalker, encoding) treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
end end
if @strip_whitespace if @strip_whitespace
require 'html5lib/filters/whitespace' require 'html5/filters/whitespace'
treewalker = Filters::WhitespaceFilter.new(treewalker) treewalker = Filters::WhitespaceFilter.new(treewalker)
end end
if @sanitize if @sanitize
require 'html5lib/filters/sanitizer' require 'html5/filters/sanitizer'
treewalker = Filters::HTMLSanitizeFilter.new(treewalker) treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
end end
if @omit_optional_tags if @omit_optional_tags
require 'html5lib/filters/optionaltags' require 'html5/filters/optionaltags'
treewalker = Filters::OptionalTagFilter.new(treewalker) treewalker = Filters::OptionalTagFilter.new(treewalker)
end end
@ -81,7 +82,7 @@ module HTML5lib
elsif [:StartTag, :EmptyTag].include? type elsif [:StartTag, :EmptyTag].include? type
name = token[:name] name = token[:name]
if RCDATA_ELEMENTS.include?(name) if RCDATA_ELEMENTS.include?(name) and not @escape_rcdata
in_cdata = true in_cdata = true
elsif in_cdata elsif in_cdata
serializeError(_("Unexpected child element of a CDATA element")) serializeError(_("Unexpected child element of a CDATA element"))

View file

@ -1,6 +1,6 @@
require 'html5lib/serializer/htmlserializer' require 'html5/serializer/htmlserializer'
module HTML5lib module HTML5
class XHTMLSerializer < HTMLSerializer class XHTMLSerializer < HTMLSerializer
DEFAULTS = { DEFAULTS = {
@ -8,7 +8,8 @@ module HTML5lib
:minimize_boolean_attributes => false, :minimize_boolean_attributes => false,
:use_trailing_solidus => true, :use_trailing_solidus => true,
:escape_lt_in_attrs => true, :escape_lt_in_attrs => true,
:omit_optional_tags => false :omit_optional_tags => false,
:escape_rcdata => true
} }
def initialize(options={}) def initialize(options={})

View file

@ -1,7 +1,7 @@
require 'html5lib/constants' require 'html5/constants'
require 'html5lib/inputstream' require 'html5/inputstream'
module HTML5lib module HTML5
# This class takes care of tokenizing HTML. # This class takes care of tokenizing HTML.
# #
@ -84,9 +84,9 @@ module HTML5lib
# Start processing. When EOF is reached @state will return false # Start processing. When EOF is reached @state will return false
# instead of true and the loop will terminate. # instead of true and the loop will terminate.
while send @state while send @state
while not @tokenQueue.empty? yield :type => :ParseError, :data => @stream.errors.shift until
yield @tokenQueue.shift @stream.errors.empty?
end yield @tokenQueue.shift until @tokenQueue.empty?
end end
end end
@ -109,7 +109,7 @@ module HTML5lib
# The character we just consumed need to be put back on the stack so it # The character we just consumed need to be put back on the stack so it
# doesn't get lost... # doesn't get lost...
@stream.queue.push(data) @stream.unget(data)
end end
# This function returns either U+FFFD or the character based on the # This function returns either U+FFFD or the character based on the
@ -128,7 +128,6 @@ module HTML5lib
radix = 16 radix = 16
end end
char = [0xFFFD].pack('U')
charStack = [] charStack = []
# Consume all the characters that are in range while making sure we # Consume all the characters that are in range while making sure we
@ -142,17 +141,25 @@ module HTML5lib
# Convert the set of characters consumed to an int. # Convert the set of characters consumed to an int.
charAsInt = charStack.join('').to_i(radix) charAsInt = charStack.join('').to_i(radix)
# If the integer is between 127 and 160 (so 128 and bigger and 159 and if charAsInt == 13
# smaller) we need to do the "windows trick". @tokenQueue.push({:type => :ParseError, :data =>
if (127...160).include? charAsInt _("Incorrect CR newline entity. Replaced with LF.")})
charAsInt = 10
elsif (128..159).include? charAsInt
# If the integer is between 127 and 160 (so 128 and bigger and 159
# and smaller) we need to do the "windows trick".
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Entity used with illegal number (windows-1252 reference).")}) _("Entity used with illegal number (windows-1252 reference).")})
charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128] charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
end end
if charAsInt > 0 and charAsInt <= 1114111 if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343)
char = [charAsInt].pack('U') char = [charAsInt].pack('U')
else
char = [0xFFFD].pack('U')
@tokenQueue.push({:type => :ParseError, :data =>
_("Numeric entity represents an illegal codepoint.")})
end end
# Discard the ; if present. Otherwise, put it back on the queue and # Discard the ; if present. Otherwise, put it back on the queue and
@ -160,18 +167,18 @@ module HTML5lib
if c != ";" if c != ";"
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Numeric entity didn't end with ';'.")}) _("Numeric entity didn't end with ';'.")})
@stream.queue.push(c) @stream.unget(c)
end end
return char return char
end end
def consumeEntity def consumeEntity(from_attribute=false)
char = nil char = nil
charStack = [@stream.char] charStack = [@stream.char]
if SPACE_CHARACTERS.include?(charStack[0]) or if SPACE_CHARACTERS.include?(charStack[0]) or
[:EOF, '<', '&'].include?(charStack[0]) [:EOF, '<', '&'].include?(charStack[0])
@stream.queue+= charStack @stream.unget(charStack)
elsif charStack[0] == "#" elsif charStack[0] == "#"
# We might have a number entity here. # We might have a number entity here.
charStack += [@stream.char, @stream.char] charStack += [@stream.char, @stream.char]
@ -179,22 +186,22 @@ module HTML5lib
# If we reach the end of the file put everything up to :EOF # If we reach the end of the file put everything up to :EOF
# back in the queue # back in the queue
charStack = charStack[0...charStack.index(:EOF)] charStack = charStack[0...charStack.index(:EOF)]
@stream.queue+= charStack @stream.unget(charStack)
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Numeric entity expected. Got end of file instead.")}) _("Numeric entity expected. Got end of file instead.")})
else else
if charStack[1].downcase == "x" \ if charStack[1].downcase == "x" \
and HEX_DIGITS.include? charStack[2] and HEX_DIGITS.include? charStack[2]
# Hexadecimal entity detected. # Hexadecimal entity detected.
@stream.queue.push(charStack[2]) @stream.unget(charStack[2])
char = consumeNumberEntity(true) char = consumeNumberEntity(true)
elsif DIGITS.include? charStack[1] elsif DIGITS.include? charStack[1]
# Decimal entity detected. # Decimal entity detected.
@stream.queue += charStack[1..-1] @stream.unget(charStack[1..-1])
char = consumeNumberEntity(false) char = consumeNumberEntity(false)
else else
# No number entity detected. # No number entity detected.
@stream.queue += charStack @stream.unget(charStack)
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Numeric entity expected but none found.")}) _("Numeric entity expected but none found.")})
end end
@ -209,6 +216,8 @@ module HTML5lib
filteredEntityList.reject! {|e| e[0].chr != charStack[0]} filteredEntityList.reject! {|e| e[0].chr != charStack[0]}
entityName = nil entityName = nil
# Try to find the longest entity the string will match to take care
# of &noti for instance.
while charStack[-1] != :EOF while charStack[-1] != :EOF
name = charStack.join('') name = charStack.join('')
if filteredEntityList.any? {|e| e[0...name.length] == name} if filteredEntityList.any? {|e| e[0...name.length] == name}
@ -220,6 +229,7 @@ module HTML5lib
if ENTITIES.include? name if ENTITIES.include? name
entityName = name entityName = name
break if entityName[-1] == ';'
end end
end end
@ -228,15 +238,23 @@ module HTML5lib
# Check whether or not the last character returned can be # Check whether or not the last character returned can be
# discarded or needs to be put back. # discarded or needs to be put back.
if not charStack[-1] == ";" if entityName[-1] != ?;
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Named entity didn't end with ';'.")}) _("Named entity didn't end with ';'.")})
@stream.queue += charStack[entityName.length..-1] end
if charStack[-1] != ";" and from_attribute and
(ASCII_LETTERS.include?(charStack[entityName.length]) or
DIGITS.include?(charStack[entityName.length]))
@stream.unget(charStack)
char = '&'
else
@stream.unget(charStack[entityName.length..-1])
end end
else else
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Named entity expected. Got none.")}) _("Named entity expected. Got none.")})
@stream.queue += charStack @stream.unget(charStack)
end end
end end
return char return char
@ -244,7 +262,7 @@ module HTML5lib
# This method replaces the need for "entityInAttributeValueState". # This method replaces the need for "entityInAttributeValueState".
def processEntityInAttribute def processEntityInAttribute
entity = consumeEntity entity = consumeEntity(true)
if entity if entity
@currentToken[:data][-1][1] += entity @currentToken[:data][-1][1] += entity
else else
@ -274,20 +292,23 @@ module HTML5lib
@lastFourChars.shift if @lastFourChars.length > 4 @lastFourChars.shift if @lastFourChars.length > 4
end end
if data == "&" and [:PCDATA,:RCDATA].include?(@contentModelFlag) if data == "&" and !@escapeFlag and
[:PCDATA,:RCDATA].include?(@contentModelFlag)
@state = @states[:entityData] @state = @states[:entityData]
elsif data == "-" and [:CDATA,:RCDATA].include?(@contentModelFlag) and elsif data == "-" and !@escapeFlag and
@escapeFlag == false and @lastFourChars.join('') == "<!--" [:CDATA,:RCDATA].include?(@contentModelFlag) and
@lastFourChars.join('') == "<!--"
@escapeFlag = true @escapeFlag = true
@tokenQueue.push({:type => :Characters, :data => data}) @tokenQueue.push({:type => :Characters, :data => data})
elsif data == "<" and @escapeFlag == false and elsif data == "<" and !@escapeFlag and
[:PCDATA,:CDATA,:RCDATA].include?(@contentModelFlag) [:PCDATA,:CDATA,:RCDATA].include?(@contentModelFlag)
@state = @states[:tagOpen] @state = @states[:tagOpen]
elsif data == ">" and [:CDATA,:RCDATA].include?(@contentModelFlag) and elsif data == ">" and @escapeFlag and
@escapeFlag == true and @lastFourChars[1..-1].join('') == "-->" [:CDATA,:RCDATA].include?(@contentModelFlag) and
@lastFourChars[1..-1].join('') == "-->"
@escapeFlag = false @escapeFlag = false
@tokenQueue.push({:type => :Characters, :data => data}) @tokenQueue.push({:type => :Characters, :data => data})
@ -345,14 +366,14 @@ module HTML5lib
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Expected tag name. Got '?' instead (HTML doesn't " + _("Expected tag name. Got '?' instead (HTML doesn't " +
"support processing instructions).")}) "support processing instructions).")})
@stream.queue.push(data) @stream.unget(data)
@state = @states[:bogusComment] @state = @states[:bogusComment]
else else
# XXX # XXX
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Expected tag name. Got something else instead")}) _("Expected tag name. Got something else instead")})
@tokenQueue.push({:type => :Characters, :data => "<"}) @tokenQueue.push({:type => :Characters, :data => "<"})
@stream.queue.push(data) @stream.unget(data)
@state = @states[:data] @state = @states[:data]
end end
else else
@ -363,7 +384,7 @@ module HTML5lib
@state = @states[:closeTagOpen] @state = @states[:closeTagOpen]
else else
@tokenQueue.push({:type => :Characters, :data => "<"}) @tokenQueue.push({:type => :Characters, :data => "<"})
@stream.queue.insert(0, data) @stream.unget(data)
@state = @states[:data] @state = @states[:data]
end end
end end
@ -388,7 +409,7 @@ module HTML5lib
# Since this is just for checking. We put the characters back on # Since this is just for checking. We put the characters back on
# the stack. # the stack.
@stream.queue += charStack @stream.unget(charStack)
end end
if @currentToken and if @currentToken and
@ -426,7 +447,7 @@ module HTML5lib
# XXX data can be _'_... # XXX data can be _'_...
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Expected closing tag. Unexpected character '#{data}' found.")}) _("Expected closing tag. Unexpected character '#{data}' found.")})
@stream.queue.push(data) @stream.unget(data)
@state = @states[:bogusComment] @state = @states[:bogusComment]
end end
@ -556,7 +577,7 @@ module HTML5lib
@state = @states[:attributeValueDoubleQuoted] @state = @states[:attributeValueDoubleQuoted]
elsif data == "&" elsif data == "&"
@state = @states[:attributeValueUnQuoted] @state = @states[:attributeValueUnQuoted]
@stream.queue.push(data); @stream.unget(data);
elsif data == "'" elsif data == "'"
@state = @states[:attributeValueSingleQuoted] @state = @states[:attributeValueSingleQuoted]
elsif data == ">" elsif data == ">"
@ -656,7 +677,7 @@ module HTML5lib
else else
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Expected '--' or 'DOCTYPE'. Not found.")}) _("Expected '--' or 'DOCTYPE'. Not found.")})
@stream.queue += charStack @stream.unget(charStack)
@state = @states[:bogusComment] @state = @states[:bogusComment]
end end
end end
@ -771,7 +792,7 @@ module HTML5lib
else else
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("No space after literal string 'DOCTYPE'.")}) _("No space after literal string 'DOCTYPE'.")})
@stream.queue.push(data) @stream.unget(data)
@state = @states[:beforeDoctypeName] @state = @states[:beforeDoctypeName]
end end
return true return true
@ -827,7 +848,7 @@ module HTML5lib
@state = @states[:data] @state = @states[:data]
elsif data == :EOF elsif data == :EOF
@currentToken[:data] = true @currentToken[:data] = true
@stream.queue.push(data) @stream.unget(data)
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in DOCTYPE.")}) _("Unexpected end of file in DOCTYPE.")})
@currentToken[:correct] = false @currentToken[:correct] = false
@ -842,7 +863,7 @@ module HTML5lib
elsif token == "system" elsif token == "system"
@state = @states[:beforeDoctypeSystemIdentifier] @state = @states[:beforeDoctypeSystemIdentifier]
else else
@stream.queue += charStack @stream.unget(charStack)
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Expected 'public' or 'system'. Got '#{charStack.join('')}'")}) _("Expected 'public' or 'system'. Got '#{charStack.join('')}'")})
@state = @states[:bogusDoctype] @state = @states[:bogusDoctype]
@ -1028,7 +1049,7 @@ module HTML5lib
@state = @states[:data] @state = @states[:data]
elsif data == :EOF elsif data == :EOF
# XXX EMIT # XXX EMIT
@stream.queue.push(data) @stream.unget(data)
@tokenQueue.push({:type => :ParseError, :data => @tokenQueue.push({:type => :ParseError, :data =>
_("Unexpected end of file in bogus doctype.")}) _("Unexpected end of file in bogus doctype.")})
@currentToken[:correct] = false @currentToken[:correct] = false

View file

@ -1,17 +1,17 @@
module HTML5lib module HTML5
module TreeBuilders module TreeBuilders
class << self class << self
def [](name) def [](name)
case name.to_s.downcase case name.to_s.downcase
when 'simpletree' then when 'simpletree' then
require 'html5lib/treebuilders/simpletree' require 'html5/treebuilders/simpletree'
SimpleTree::TreeBuilder SimpleTree::TreeBuilder
when 'rexml' then when 'rexml' then
require 'html5lib/treebuilders/rexml' require 'html5/treebuilders/rexml'
REXML::TreeBuilder REXML::TreeBuilder
when 'hpricot' then when 'hpricot' then
require 'html5lib/treebuilders/hpricot' require 'html5/treebuilders/hpricot'
Hpricot::TreeBuilder Hpricot::TreeBuilder
else else
raise "Unknown TreeBuilder #{name}" raise "Unknown TreeBuilder #{name}"

View file

@ -1,8 +1,8 @@
require 'html5lib/constants' require 'html5/constants'
#XXX - TODO; make the default interface more ElementTree-like rather than DOM-like #XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
module HTML5lib module HTML5
# The scope markers are inserted when entering buttons, object elements, # The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting # marquees, table cells, and table captions, and are used to prevent formatting

View file

@ -1,9 +1,9 @@
require 'html5lib/treebuilders/base' require 'html5/treebuilders/base'
require 'rubygems' require 'rubygems'
require 'hpricot' require 'hpricot'
require 'forwardable' require 'forwardable'
module HTML5lib module HTML5
module TreeBuilders module TreeBuilders
module Hpricot module Hpricot

View file

@ -1,8 +1,8 @@
require 'html5lib/treebuilders/base' require 'html5/treebuilders/base'
require 'rexml/document' require 'rexml/document'
require 'forwardable' require 'forwardable'
module HTML5lib module HTML5
module TreeBuilders module TreeBuilders
module REXML module REXML

View file

@ -1,6 +1,6 @@
require 'html5lib/treebuilders/base' require 'html5/treebuilders/base'
module HTML5lib module HTML5
module TreeBuilders module TreeBuilders
module SimpleTree module SimpleTree

View file

@ -1,19 +1,19 @@
require 'html5lib/treewalkers/base' require 'html5/treewalkers/base'
module HTML5lib module HTML5
module TreeWalkers module TreeWalkers
class << self class << self
def [](name) def [](name)
case name.to_s.downcase case name.to_s.downcase
when 'simpletree' then when 'simpletree' then
require 'html5lib/treewalkers/simpletree' require 'html5/treewalkers/simpletree'
SimpleTree::TreeWalker SimpleTree::TreeWalker
when 'rexml' then when 'rexml' then
require 'html5lib/treewalkers/rexml' require 'html5/treewalkers/rexml'
REXML::TreeWalker REXML::TreeWalker
when 'hpricot' then when 'hpricot' then
require 'html5lib/treewalkers/hpricot' require 'html5/treewalkers/hpricot'
Hpricot::TreeWalker Hpricot::TreeWalker
else else
raise "Unknown TreeWalker #{name}" raise "Unknown TreeWalker #{name}"

View file

@ -1,5 +1,5 @@
require 'html5lib/constants' require 'html5/constants'
module HTML5lib module HTML5
module TreeWalkers module TreeWalkers
module TokenConstructor module TokenConstructor

View file

@ -1,10 +1,10 @@
require 'html5lib/treewalkers/base' require 'html5/treewalkers/base'
require 'rexml/document' require 'rexml/document'
module HTML5lib module HTML5
module TreeWalkers module TreeWalkers
module Hpricot module Hpricot
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
def node_details(node) def node_details(node)
case node case node

View file

@ -1,10 +1,10 @@
require 'html5lib/treewalkers/base' require 'html5/treewalkers/base'
require 'rexml/document' require 'rexml/document'
module HTML5lib module HTML5
module TreeWalkers module TreeWalkers
module REXML module REXML
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
def node_details(node) def node_details(node)
case node case node

View file

@ -1,10 +1,10 @@
require 'html5lib/treewalkers/base' require 'html5/treewalkers/base'
module HTML5lib module HTML5
module TreeWalkers module TreeWalkers
module SimpleTree module SimpleTree
class TreeWalker < HTML5lib::TreeWalkers::Base class TreeWalker < HTML5::TreeWalkers::Base
include HTML5lib::TreeBuilders::SimpleTree include HTML5::TreeBuilders::SimpleTree
def walk(node) def walk(node)
case node case node

View file

@ -1,708 +0,0 @@
module HTML5lib
class EOF < Exception; end
CONTENT_MODEL_FLAGS = [
:PCDATA,
:RCDATA,
:CDATA,
:PLAINTEXT
]
SCOPING_ELEMENTS = %w[
button
caption
html
marquee
object
table
td
th
]
FORMATTING_ELEMENTS = %w[
a
b
big
em
font
i
nobr
s
small
strike
strong
tt
u
]
SPECIAL_ELEMENTS = %w[
address
area
base
basefont
bgsound
blockquote
body
br
center
col
colgroup
dd
dir
div
dl
dt
embed
fieldset
form
frame
frameset
h1
h2
h3
h4
h5
h6
head
hr
iframe
image
img
input
isindex
li
link
listing
menu
meta
noembed
noframes
noscript
ol
optgroup
option
p
param
plaintext
pre
script
select
spacer
style
tbody
textarea
tfoot
thead
title
tr
ul
wbr
]
SPACE_CHARACTERS = %W[
\t
\n
\x0B
\x0C
\x20
\r
]
TABLE_INSERT_MODE_ELEMENTS = %w[
table
tbody
tfoot
thead
tr
]
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
DIGITS = '0'..'9'
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
# Heading elements need to be ordered
HEADING_ELEMENTS = %w[
h1
h2
h3
h4
h5
h6
]
# XXX What about event-source and command?
VOID_ELEMENTS = %w[
base
link
meta
hr
br
img
embed
param
area
col
input
]
CDATA_ELEMENTS = %w[title textarea]
RCDATA_ELEMENTS = %w[
style
script
xmp
iframe
noembed
noframes
noscript
]
BOOLEAN_ATTRIBUTES = {
:global => %w[irrelevant],
'style' => %w[scoped],
'img' => %w[ismap],
'audio' => %w[autoplay controls],
'video' => %w[autoplay controls],
'script' => %w[defer async],
'details' => %w[open],
'datagrid' => %w[multiple disabled],
'command' => %w[hidden disabled checked default],
'menu' => %w[autosubmit],
'fieldset' => %w[disabled readonly],
'option' => %w[disabled readonly selected],
'optgroup' => %w[disabled readonly],
'button' => %w[disabled autofocus],
'input' => %w[disabled readonly required autofocus checked ismap],
'select' => %w[disabled readonly autofocus multiple],
'output' => %w[disabled readonly]
}
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
ENTITIES_WINDOWS1252 = [
8364, # 0x80 0x20AC EURO SIGN
65533, # 0x81 UNDEFINED
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
8224, # 0x86 0x2020 DAGGER
8225, # 0x87 0x2021 DOUBLE DAGGER
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
8240, # 0x89 0x2030 PER MILLE SIGN
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
65533, # 0x8D UNDEFINED
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
65533, # 0x8F UNDEFINED
65533, # 0x90 UNDEFINED
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
8226, # 0x95 0x2022 BULLET
8211, # 0x96 0x2013 EN DASH
8212, # 0x97 0x2014 EM DASH
732, # 0x98 0x02DC SMALL TILDE
8482, # 0x99 0x2122 TRADE MARK SIGN
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
65533, # 0x9D UNDEFINED
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
]
private
# Encodes a single Unicode code point as a UTF-8 string.
#
# n - Integer code point (e.g. 0xC6).
#
# Returns the String produced by packing n with the 'U' directive.
#
# NOTE: this is a singleton method (`def self.`), so a bare `private`
# placed before it does not change its visibility.
def self.U(n)
  codepoints = [n]
  codepoints.pack('U')
end
public
ENTITIES = {
"AElig" => U(0xC6),
"Aacute" => U(0xC1),
"Acirc" => U(0xC2),
"Agrave" => U(0xC0),
"Alpha" => U(0x0391),
"Aring" => U(0xC5),
"Atilde" => U(0xC3),
"Auml" => U(0xC4),
"Beta" => U(0x0392),
"Ccedil" => U(0xC7),
"Chi" => U(0x03A7),
"Dagger" => U(0x2021),
"Delta" => U(0x0394),
"ETH" => U(0xD0),
"Eacute" => U(0xC9),
"Ecirc" => U(0xCA),
"Egrave" => U(0xC8),
"Epsilon" => U(0x0395),
"Eta" => U(0x0397),
"Euml" => U(0xCB),
"Gamma" => U(0x0393),
"Iacute" => U(0xCD),
"Icirc" => U(0xCE),
"Igrave" => U(0xCC),
"Iota" => U(0x0399),
"Iuml" => U(0xCF),
"Kappa" => U(0x039A),
"Lambda" => U(0x039B),
"Mu" => U(0x039C),
"Ntilde" => U(0xD1),
"Nu" => U(0x039D),
"OElig" => U(0x0152),
"Oacute" => U(0xD3),
"Ocirc" => U(0xD4),
"Ograve" => U(0xD2),
"Omega" => U(0x03A9),
"Omicron" => U(0x039F),
"Oslash" => U(0xD8),
"Otilde" => U(0xD5),
"Ouml" => U(0xD6),
"Phi" => U(0x03A6),
"Pi" => U(0x03A0),
"Prime" => U(0x2033),
"Psi" => U(0x03A8),
"Rho" => U(0x03A1),
"Scaron" => U(0x0160),
"Sigma" => U(0x03A3),
"THORN" => U(0xDE),
"Tau" => U(0x03A4),
"Theta" => U(0x0398),
"Uacute" => U(0xDA),
"Ucirc" => U(0xDB),
"Ugrave" => U(0xD9),
"Upsilon" => U(0x03A5),
"Uuml" => U(0xDC),
"Xi" => U(0x039E),
"Yacute" => U(0xDD),
"Yuml" => U(0x0178),
"Zeta" => U(0x0396),
"aacute" => U(0xE1),
"acirc" => U(0xE2),
"acute" => U(0xB4),
"aelig" => U(0xE6),
"agrave" => U(0xE0),
"alefsym" => U(0x2135),
"alpha" => U(0x03B1),
"amp" => U(0x26),
"AMP" => U(0x26),
"and" => U(0x2227),
"ang" => U(0x2220),
"apos" => U(0x27),
"aring" => U(0xE5),
"asymp" => U(0x2248),
"atilde" => U(0xE3),
"auml" => U(0xE4),
"bdquo" => U(0x201E),
"beta" => U(0x03B2),
"brvbar" => U(0xA6),
"bull" => U(0x2022),
"cap" => U(0x2229),
"ccedil" => U(0xE7),
"cedil" => U(0xB8),
"cent" => U(0xA2),
"chi" => U(0x03C7),
"circ" => U(0x02C6),
"clubs" => U(0x2663),
"cong" => U(0x2245),
"copy" => U(0xA9),
"COPY" => U(0xA9),
"crarr" => U(0x21B5),
"cup" => U(0x222A),
"curren" => U(0xA4),
"dArr" => U(0x21D3),
"dagger" => U(0x2020),
"darr" => U(0x2193),
"deg" => U(0xB0),
"delta" => U(0x03B4),
"diams" => U(0x2666),
"divide" => U(0xF7),
"eacute" => U(0xE9),
"ecirc" => U(0xEA),
"egrave" => U(0xE8),
"empty" => U(0x2205),
"emsp" => U(0x2003),
"ensp" => U(0x2002),
"epsilon" => U(0x03B5),
"equiv" => U(0x2261),
"eta" => U(0x03B7),
"eth" => U(0xF0),
"euml" => U(0xEB),
"euro" => U(0x20AC),
"exist" => U(0x2203),
"fnof" => U(0x0192),
"forall" => U(0x2200),
"frac12" => U(0xBD),
"frac14" => U(0xBC),
"frac34" => U(0xBE),
"frasl" => U(0x2044),
"gamma" => U(0x03B3),
"ge" => U(0x2265),
"gt" => U(0x3E),
"GT" => U(0x3E),
"hArr" => U(0x21D4),
"harr" => U(0x2194),
"hearts" => U(0x2665),
"hellip" => U(0x2026),
"iacute" => U(0xED),
"icirc" => U(0xEE),
"iexcl" => U(0xA1),
"igrave" => U(0xEC),
"image" => U(0x2111),
"infin" => U(0x221E),
"int" => U(0x222B),
"iota" => U(0x03B9),
"iquest" => U(0xBF),
"isin" => U(0x2208),
"iuml" => U(0xEF),
"kappa" => U(0x03BA),
"lArr" => U(0x21D0),
"lambda" => U(0x03BB),
"lang" => U(0x2329),
"laquo" => U(0xAB),
"larr" => U(0x2190),
"lceil" => U(0x2308),
"ldquo" => U(0x201C),
"le" => U(0x2264),
"lfloor" => U(0x230A),
"lowast" => U(0x2217),
"loz" => U(0x25CA),
"lrm" => U(0x200E),
"lsaquo" => U(0x2039),
"lsquo" => U(0x2018),
"lt" => U(0x3C),
"LT" => U(0x3C),
"macr" => U(0xAF),
"mdash" => U(0x2014),
"micro" => U(0xB5),
"middot" => U(0xB7),
"minus" => U(0x2212),
"mu" => U(0x03BC),
"nabla" => U(0x2207),
"nbsp" => U(0xA0),
"ndash" => U(0x2013),
"ne" => U(0x2260),
"ni" => U(0x220B),
"not" => U(0xAC),
"notin" => U(0x2209),
"nsub" => U(0x2284),
"ntilde" => U(0xF1),
"nu" => U(0x03BD),
"oacute" => U(0xF3),
"ocirc" => U(0xF4),
"oelig" => U(0x0153),
"ograve" => U(0xF2),
"oline" => U(0x203E),
"omega" => U(0x03C9),
"omicron" => U(0x03BF),
"oplus" => U(0x2295),
"or" => U(0x2228),
"ordf" => U(0xAA),
"ordm" => U(0xBA),
"oslash" => U(0xF8),
"otilde" => U(0xF5),
"otimes" => U(0x2297),
"ouml" => U(0xF6),
"para" => U(0xB6),
"part" => U(0x2202),
"permil" => U(0x2030),
"perp" => U(0x22A5),
"phi" => U(0x03C6),
"pi" => U(0x03C0),
"piv" => U(0x03D6),
"plusmn" => U(0xB1),
"pound" => U(0xA3),
"prime" => U(0x2032),
"prod" => U(0x220F),
"prop" => U(0x221D),
"psi" => U(0x03C8),
"quot" => U(0x22),
"QUOT" => U(0x22),
"rArr" => U(0x21D2),
"radic" => U(0x221A),
"rang" => U(0x232A),
"raquo" => U(0xBB),
"rarr" => U(0x2192),
"rceil" => U(0x2309),
"rdquo" => U(0x201D),
"real" => U(0x211C),
"reg" => U(0xAE),
"REG" => U(0xAE),
"rfloor" => U(0x230B),
"rho" => U(0x03C1),
"rlm" => U(0x200F),
"rsaquo" => U(0x203A),
"rsquo" => U(0x2019),
"sbquo" => U(0x201A),
"scaron" => U(0x0161),
"sdot" => U(0x22C5),
"sect" => U(0xA7),
"shy" => U(0xAD),
"sigma" => U(0x03C3),
"sigmaf" => U(0x03C2),
"sim" => U(0x223C),
"spades" => U(0x2660),
"sub" => U(0x2282),
"sube" => U(0x2286),
"sum" => U(0x2211),
"sup" => U(0x2283),
"sup1" => U(0xB9),
"sup2" => U(0xB2),
"sup3" => U(0xB3),
"supe" => U(0x2287),
"szlig" => U(0xDF),
"tau" => U(0x03C4),
"there4" => U(0x2234),
"theta" => U(0x03B8),
"thetasym" => U(0x03D1),
"thinsp" => U(0x2009),
"thorn" => U(0xFE),
"tilde" => U(0x02DC),
"times" => U(0xD7),
"trade" => U(0x2122),
"uArr" => U(0x21D1),
"uacute" => U(0xFA),
"uarr" => U(0x2191),
"ucirc" => U(0xFB),
"ugrave" => U(0xF9),
"uml" => U(0xA8),
"upsih" => U(0x03D2),
"upsilon" => U(0x03C5),
"uuml" => U(0xFC),
"weierp" => U(0x2118),
"xi" => U(0x03BE),
"yacute" => U(0xFD),
"yen" => U(0xA5),
"yuml" => U(0xFF),
"zeta" => U(0x03B6),
"zwj" => U(0x200D),
"zwnj" => U(0x200C)
}
# Character-encoding labels and their aliases, all written in lowercase.
# The %w[] literal turns the whitespace-separated words below into a flat
# Array of Strings, one entry per line.
# NOTE(review): presumably consulted when checking a declared charset name;
# confirm against the code that reads this constant (callers would need to
# downcase a label before looking it up here).
ENCODINGS = %w[
ansi_x3.4-1968
iso-ir-6
ansi_x3.4-1986
iso_646.irv:1991
ascii
iso646-us
us-ascii
us
ibm367
cp367
csascii
ks_c_5601-1987
korean
iso-2022-kr
csiso2022kr
euc-kr
iso-2022-jp
csiso2022jp
iso-2022-jp-2
iso-ir-58
chinese
csiso58gb231280
iso_8859-1:1987
iso-ir-100
iso_8859-1
iso-8859-1
latin1
l1
ibm819
cp819
csisolatin1
iso_8859-2:1987
iso-ir-101
iso_8859-2
iso-8859-2
latin2
l2
csisolatin2
iso_8859-3:1988
iso-ir-109
iso_8859-3
iso-8859-3
latin3
l3
csisolatin3
iso_8859-4:1988
iso-ir-110
iso_8859-4
iso-8859-4
latin4
l4
csisolatin4
iso_8859-6:1987
iso-ir-127
iso_8859-6
iso-8859-6
ecma-114
asmo-708
arabic
csisolatinarabic
iso_8859-7:1987
iso-ir-126
iso_8859-7
iso-8859-7
elot_928
ecma-118
greek
greek8
csisolatingreek
iso_8859-8:1988
iso-ir-138
iso_8859-8
iso-8859-8
hebrew
csisolatinhebrew
iso_8859-5:1988
iso-ir-144
iso_8859-5
iso-8859-5
cyrillic
csisolatincyrillic
iso_8859-9:1989
iso-ir-148
iso_8859-9
iso-8859-9
latin5
l5
csisolatin5
iso-8859-10
iso-ir-157
l6
iso_8859-10:1992
csisolatin6
latin6
hp-roman8
roman8
r8
ibm037
cp037
csibm037
ibm424
cp424
csibm424
ibm437
cp437
437
cspc8codepage437
ibm500
cp500
csibm500
ibm775
cp775
cspc775baltic
ibm850
cp850
850
cspc850multilingual
ibm852
cp852
852
cspcp852
ibm855
cp855
855
csibm855
ibm857
cp857
857
csibm857
ibm860
cp860
860
csibm860
ibm861
cp861
861
cp-is
csibm861
ibm862
cp862
862
cspc862latinhebrew
ibm863
cp863
863
csibm863
ibm864
cp864
csibm864
ibm865
cp865
865
csibm865
ibm866
cp866
866
csibm866
ibm869
cp869
869
cp-gr
csibm869
ibm1026
cp1026
csibm1026
koi8-r
cskoi8r
koi8-u
big5-hkscs
ptcp154
csptcp154
pt154
cp154
utf-7
utf-16be
utf-16le
utf-16
utf-8
iso-8859-13
iso-8859-14
iso-ir-199
iso_8859-14:1998
iso_8859-14
latin8
iso-celtic
l8
iso-8859-15
iso_8859-15
iso-8859-16
iso-ir-226
iso_8859-16:2001
iso_8859-16
latin10
l10
gbk
cp936
ms936
gb18030
shift_jis
ms_kanji
csshiftjis
euc-jp
gb2312
big5
csbig5
windows-1250
windows-1251
windows-1252
windows-1253
windows-1254
windows-1255
windows-1256
windows-1257
windows-1258
tis-620
hz-gb-2312
]
end

View file

@ -1 +0,0 @@
require 'html5lib/filters/optionaltags'

View file

@ -1,2 +0,0 @@
require 'html5lib/serializer/htmlserializer'
require 'html5lib/serializer/xhtmlserializer'

View file

@ -26,15 +26,15 @@ def parse(opts, args)
exit(1) exit(1)
end end
require 'html5lib/treebuilders' require 'html5/treebuilders'
treebuilder = HTML5lib::TreeBuilders[opts.treebuilder] treebuilder = HTML5::TreeBuilders[opts.treebuilder]
if opts.output == :xml if opts.output == :xml
require 'html5lib/liberalxmlparser' require 'html5/liberalxmlparser'
p = HTML5lib::XHTMLParser.new(:tree=>treebuilder) p = HTML5::XHTMLParser.new(:tree=>treebuilder)
else else
require 'html5lib/html5parser' require 'html5/html5parser'
p = HTML5lib::HTMLParser.new(:tree=>treebuilder) p = HTML5::HTMLParser.new(:tree=>treebuilder)
end end
if opts.parsemethod == :parse if opts.parsemethod == :parse
@ -70,10 +70,10 @@ def printOutput(parser, document, opts)
when :xml when :xml
print document print document
when :html when :html
require 'html5lib/treewalkers' require 'html5/treewalkers'
tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document) tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
require 'html5lib/serializer' require 'html5/serializer'
puts HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer) puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
when :hilite when :hilite
print document.hilite print document.hilite
when :tree when :tree
@ -188,6 +188,10 @@ opts = OptionParser.new do |opts|
options.serializer[:escape_lt_in_attrs] = lt options.serializer[:escape_lt_in_attrs] = lt
end end
opts.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata|
options.serializer[:escape_rcdata] = rcdata
end
opts.separator "" opts.separator ""
opts.separator "Other Options:" opts.separator "Other Options:"

View file

@ -33,7 +33,6 @@ EUC-jp
#encoding #encoding
EUC-jp EUC-jp
#data #data
<!-- --> <!-- -->
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> <meta http-equiv="Content-Type" content="text/html; charset=utf-8">

View file

@ -92,7 +92,8 @@
{"description": "rcdata", {"description": "rcdata",
"input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]], "input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]],
"expected": ["<script>a<b>c&d"] "expected": ["<script>a<b>c&d"],
"xhtml": ["<script>a&lt;b&gt;c&amp;d"]
}, },
{"description": "doctype", {"description": "doctype",

View file

@ -49,6 +49,12 @@
"options": {"escape_lt_in_attrs": true}, "options": {"escape_lt_in_attrs": true},
"input": [["StartTag", "a", {"title": "a<b>c&d"}]], "input": [["StartTag", "a", {"title": "a<b>c&d"}]],
"expected": ["<a title=\"a&lt;b>c&amp;d\">"] "expected": ["<a title=\"a&lt;b>c&amp;d\">"]
},
{"description": "rcdata",
"options": {"escape_rcdata": true},
"input": [["StartTag", "script", {}], ["Characters", "a<b>c&d"]],
"expected": ["<script>a&lt;b&gt;c&amp;d"]
} }
]} ]}

View file

@ -135,7 +135,7 @@
{"description":"Entity without trailing semicolon (2)", {"description":"Entity without trailing semicolon (2)",
"input":"I'm &notin", "input":"I'm &notin",
"output":[["Character","I'm "], "ParseError", ["Character", ""]]}, "output":[["Character","I'm "], "ParseError", ["Character", "¬in"]]},
{"description":"Partial entity match at end of file", {"description":"Partial entity match at end of file",
"input":"I'm &no", "input":"I'm &no",
@ -151,6 +151,18 @@
{"description":"Hexadecimal entity in attribute", {"description":"Hexadecimal entity in attribute",
"input":"<h a='&#x3f;'></h>", "input":"<h a='&#x3f;'></h>",
"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]} "output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]},
{"description":"Entity in attribute without semicolon ending in x",
"input":"<h a='&notx'>",
"output":["ParseError", ["StartTag", "h", {"a":"&notx"}]]},
{"description":"Entity in attribute without semicolon ending in 1",
"input":"<h a='&not1'>",
"output":["ParseError", ["StartTag", "h", {"a":"&not1"}]]},
{"description":"Entity in attribute without semicolon",
"input":"<h a='&COPY'>",
"output":["ParseError", ["StartTag", "h", {"a":"©"}]]}
]} ]}

View file

@ -42,19 +42,23 @@
{"description":"Numeric entity representing the NUL character", {"description":"Numeric entity representing the NUL character",
"input":"&#0000;", "input":"&#0000;",
"output":[["Character", "\uFFFD"]]}, "output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity representing the NUL character", {"description":"Hexadecimal entity representing the NUL character",
"input":"&#x0000;", "input":"&#x0000;",
"output":[["Character", "\uFFFD"]]}, "output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)", {"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#2225222;", "input":"&#2225222;",
"output":[["Character", "\uFFFD"]]}, "output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)", {"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#x1010FFFF;", "input":"&#x1010FFFF;",
"output":[["Character", "\uFFFD"]]}, "output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity pair representing a surrogate pair",
"input":"&#xD869;&#xDED6;",
"output":["ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"]]},
{"description":"Numeric entity representing a Windows-1252 'codepoint'", {"description":"Numeric entity representing a Windows-1252 'codepoint'",
"input":"&#137;", "input":"&#137;",
@ -118,7 +122,7 @@
{"description":"Null Byte Replacement", {"description":"Null Byte Replacement",
"input":"\u0000", "input":"\u0000",
"output":[["Character", "\ufffd"]]} "output":["ParseError", ["Character", "\ufffd"]]}
]} ]}

View file

@ -285,6 +285,7 @@ Line1<br>Line2<br>Line3<br>Line4
| <div> | <div>
| <b> | <b>
| <marquee> | <marquee>
| <p>
| "X" | "X"
#data #data
@ -330,6 +331,7 @@ Unexpected end of file
| <body> | <body>
| <p> | <p>
| <hr> | <hr>
| <p>
#data #data
<select><b><option><select><option></b></select>X <select><b><option><select><option></b></select>X
@ -1369,13 +1371,14 @@ unexpected EOF
<head></p><meta><p> <head></p><meta><p>
#errors #errors
6: missing document type declaration 6: missing document type declaration
10: unexpected p element end tag in head 10: unexpected p element end tag
#document #document
| <html> | <html>
| <head> | <head>
| <meta>
| <body> | <body>
| <p> | <p>
| <meta>
| <p>
#data #data
<head></html><meta><p> <head></html><meta><p>
@ -1485,6 +1488,7 @@ unexpected EOF
| <div> | <div>
| <b> | <b>
| <marquee> | <marquee>
| <p>
#data #data
<script></script></div><title></title><p><p> <script></script></div><title></title><p><p>
@ -1511,6 +1515,7 @@ unexpected EOF
| <body> | <body>
| <p> | <p>
| <hr> | <hr>
| <p>
#data #data
<select><b><option><select><option></b></select> <select><b><option><select><option></b></select>
@ -1807,6 +1812,7 @@ Unexpected EOF
| <head> | <head>
| <body> | <body>
| <br> | <br>
| <p>
#data #data
<table><tr></strong></b></em></i></u></strike></s></blink></tt></pre></big></small></font></select></h1></h2></h3></h4></h5></h6></body></br></a></img></title></span></style></script></table></th></td></tr></frame></area></link></param></hr></input></col></base></meta></basefont></bgsound></embed></spacer></p></dd></dt></caption></colgroup></tbody></tfoot></thead></address></blockquote></center></dir></div></dl></fieldset></listing></menu></ol></ul></li></nobr></wbr></form></button></marquee></object></html></frameset></head></iframe></image></isindex></noembed></noframes></noscript></optgroup></option></plaintext></textarea> <table><tr></strong></b></em></i></u></strike></s></blink></tt></pre></big></small></font></select></h1></h2></h3></h4></h5></h6></body></br></a></img></title></span></style></script></table></th></td></tr></frame></area></link></param></hr></input></col></base></meta></basefont></bgsound></embed></spacer></p></dd></dt></caption></colgroup></tbody></tfoot></thead></address></blockquote></center></dir></div></dl></fieldset></listing></menu></ol></ul></li></nobr></wbr></form></button></marquee></object></html></frameset></head></iframe></image></isindex></noembed></noframes></noscript></optgroup></option></plaintext></textarea>
@ -1928,3 +1934,4 @@ Unexpected EOF
| <table> | <table>
| <tbody> | <tbody>
| <tr> | <tr>
| <p>

View file

@ -777,3 +777,4 @@ Unexpected </p> end tag.
| <tbody> | <tbody>
| <tr> | <tr>
| <td> | <td>
| <p>

View file

@ -61,7 +61,6 @@ No DOCTYPE
#data #data
<!DOCTYPE htML><html><head></head><body><pre> <!DOCTYPE htML><html><head></head><body><pre>
foo</pre></body></html> foo</pre></body></html>
#errors #errors
#document #document
@ -72,10 +71,22 @@ foo</pre></body></html>
| <pre> | <pre>
| "foo" | "foo"
#data #data
<!DOCTYPE htML><html><head></head><body><pre> <!DOCTYPE htML><html><head></head><body><pre>
foo</pre></body></html>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <pre>
| "
foo"
#data
<!DOCTYPE htML><html><head></head><body><pre>
foo foo
</pre></body></html> </pre></body></html>
#errors #errors
@ -183,7 +194,6 @@ y</pre></body></html>
#data #data
<!DOCTYPE htML><textarea> <!DOCTYPE htML><textarea>
foo</textarea> foo</textarea>
#errors #errors
#document #document
@ -194,6 +204,20 @@ foo</textarea>
| <textarea> | <textarea>
| "foo" | "foo"
#data
<!DOCTYPE htML><textarea>
foo</textarea>
#errors
#document
| <!DOCTYPE htML>
| <html>
| <head>
| <body>
| <textarea>
| "
foo"
#data #data
<!DOCTYPE htML><html><head></head><body><ul><li><div><p><li></ul></body></html> <!DOCTYPE htML><html><head></head><body><ul><li><div><p><li></ul></body></html>
#errors #errors

View file

@ -1,37 +1,49 @@
#data #data
direct div content direct div content
#errors #errors
#document-fragment div #document-fragment
div
#document
| "direct div content" | "direct div content"
#data #data
direct textarea content direct textarea content
#errors #errors
#document-fragment textarea #document-fragment
textarea
#document
| "direct textarea content" | "direct textarea content"
#data #data
textarea content with <em>pseudo</em> <foo>markup textarea content with <em>pseudo</em> <foo>markup
#errors #errors
#document-fragment textarea #document-fragment
textarea
#document
| "textarea content with <em>pseudo</em> <foo>markup" | "textarea content with <em>pseudo</em> <foo>markup"
#data #data
this is &#x0043;DATA inside a <style> element this is &#x0043;DATA inside a <style> element
#errors #errors
#document-fragment style #document-fragment
style
#document
| "this is &#x0043;DATA inside a <style> element" | "this is &#x0043;DATA inside a <style> element"
#data #data
</plaintext> </plaintext>
#errors #errors
#document-fragment plaintext #document-fragment
plaintext
#document
| "</plaintext>" | "</plaintext>"
#data #data
setting html's innerHTML setting html's innerHTML
#errors #errors
#document-fragment html #document-fragment
html
#document
| <head> | <head>
| <body> | <body>
| "setting html's innerHTML" | "setting html's innerHTML"
@ -39,6 +51,8 @@ setting html's innerHTML
#data #data
<title>setting head's innerHTML</title> <title>setting head's innerHTML</title>
#errors #errors
#document-fragment head #document-fragment
head
#document
| <title> | <title>
| "setting head's innerHTML" | "setting head's innerHTML"

View file

@ -27,3 +27,41 @@
| <head> | <head>
| <body> | <body>
| <meta> | <meta>
#data
<!doctype HTml><form><div></form><div>
#errors
Form end tag ignored.
Unexpected end of file.
#document
| <!DOCTYPE HTml>
| <html>
| <head>
| <body>
| <form>
| <div>
| <div>
#data
<!doctype HTml><title>&amp;</title>
#errors
Unexpected end of file.
#document
| <!DOCTYPE HTml>
| <html>
| <head>
| <title>
| "&"
| <body>
#data
<!doctype HTml><title><!--&amp;--></title>
#errors
Unexpected end of file.
#document
| <!DOCTYPE HTml>
| <html>
| <head>
| <title>
| "<!--&amp;-->"
| <body>

View file

@ -1,9 +1,9 @@
require 'test/unit' require 'test/unit'
HTML5LIB_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__)))) HTML5_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))
if File.exists?(File.join(HTML5LIB_BASE, 'testdata')) if File.exists?(File.join(HTML5_BASE, 'testdata'))
TESTDATA_DIR = File.join(HTML5LIB_BASE, 'testdata') TESTDATA_DIR = File.join(HTML5_BASE, 'testdata')
else else
TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata') TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata')
end end
@ -12,7 +12,7 @@ $:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')
$:.unshift File.dirname(__FILE__) $:.unshift File.dirname(__FILE__)
def html5lib_test_files(subdirectory) def html5_test_files(subdirectory)
Dir[File.join(TESTDATA_DIR, subdirectory, '*.*')] Dir[File.join(TESTDATA_DIR, subdirectory, '*.*')]
end end
@ -30,42 +30,8 @@ rescue LoadError
end end
end end
module HTML5lib module HTML5
module TestSupport module TestSupport
def self.startswith?(a, b)
b[0... a.length] == a
end
def self.parseTestcase(data)
innerHTML = nil
input = []
output = []
errors = []
currentList = input
data.split(/\n/).each do |line|
if !line.empty? and !startswith?("#errors", line) and
!startswith?("#document", line) and
!startswith?("#data", line) and
!startswith?("#document-fragment", line)
if currentList == output and startswith?("|", line)
currentList.push(line[2..-1])
else
currentList.push(line)
end
elsif line == "#errors"
currentList = errors
elsif line == "#document" or startswith?("#document-fragment", line)
if startswith?("#document-fragment", line)
innerHTML = line[19..-1]
raise AssertionError unless innerHTML
end
currentList = output
end
end
return innerHTML, input.join("\n"), output.join("\n"), errors
end
# convert the output of str(document) to the format used in the testcases # convert the output of str(document) to the format used in the testcases
def convertTreeDump(treedump) def convertTreeDump(treedump)
treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n") treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n")
@ -77,5 +43,39 @@ module HTML5lib
end end
end end
# Reads a sectioned test-data file and enumerates its records.
#
# A file is a sequence of records. Each record is made of sections, and a
# section starts with a heading line of the form "#name" where +name+ is one
# of the configured section names. Every following line belongs to that
# section until the next recognised heading. A new record starts whenever the
# first configured section name is seen again.
#
# Each record is yielded as an Array holding the text of every configured
# section, in the order the sections were configured. A section that does not
# occur in a record is +nil+.
class TestData
  include Enumerable

  # filename:: path of the test-data file to read
  # sections:: Array of section names (without the leading "#"); the first
  #            entry is the one that marks the start of a new record
  #
  # The file is not opened here; it is opened for the duration of each call
  # to #each and closed again afterwards, so the object can be enumerated
  # any number of times and never leaks a file handle.
  def initialize(filename, sections)
    @filename = filename
    @sections = sections
  end

  # Yields one Array per record (see the class comment for its layout).
  # Returns an Enumerator when called without a block.
  def each
    return to_enum(:each) unless block_given?

    data = {}
    key = nil
    File.foreach(@filename) do |line|
      heading = section_heading(line)
      if heading
        if !data.empty? && heading == @sections.first
          # Records are separated by a blank line; drop it from the section
          # that was being collected before normaliseOutput trims the rest.
          data[key].chomp!
          yield normaliseOutput(data)
          data = {}
        end
        key = heading
        data[key] = ''.dup
      elsif key
        # Lines seen before the first heading (key still nil) are ignored.
        data[key] << line
      end
    end
    # Emit the final record, but nothing at all for a file that never
    # produced a section (an empty Hash is truthy, so test emptiness).
    yield normaliseOutput(data) unless data.empty?
  end

  # Strips one trailing newline from every collected section (in place) and
  # returns the section texts ordered like the configured section names.
  def normaliseOutput(data)
    data.each_value(&:chomp!)
    @sections.map { |heading| data[heading] }
  end

  private

  # Returns the section name when +line+ is a heading for one of the
  # configured sections, otherwise nil. The line terminator is removed with
  # chomp so a heading on a final line without a newline is still recognised.
  def section_heading(line)
    return nil unless line.start_with?('#')

    name = line.chomp[1..-1]
    @sections.include?(name) ? name : nil
  end
end
end end
end end

View file

@ -1,8 +1,10 @@
require File.join(File.dirname(__FILE__), 'preamble') require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/inputstream' require 'html5/inputstream'
class Html5EncodingTestCase < Test::Unit::TestCase class Html5EncodingTestCase < Test::Unit::TestCase
include HTML5
include TestSupport
begin begin
require 'rubygems' require 'rubygems'
@ -10,23 +12,21 @@ class Html5EncodingTestCase < Test::Unit::TestCase
def test_chardet def test_chardet
file = File.open(File.join(TESTDATA_DIR, 'encoding', 'chardet', 'test_big5.txt'), 'r') file = File.open(File.join(TESTDATA_DIR, 'encoding', 'chardet', 'test_big5.txt'), 'r')
stream = HTML5lib::HTMLInputStream.new(file, :chardet => true) stream = HTML5::HTMLInputStream.new(file, :chardet => true)
assert_equal 'big5', stream.char_encoding.downcase assert_equal 'big5', stream.char_encoding.downcase
rescue LoadError rescue LoadError
puts "chardet not found, skipping chardet tests" puts "chardet not found, skipping chardet tests"
end end
end end
html5lib_test_files('encoding').each do |test_file| html5_test_files('encoding').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '').tr('-', '') test_name = File.basename(test_file).sub('.dat', '').tr('-', '')
File.read(test_file).split("#data\n").each_with_index do |data, index| TestData.new(test_file, %w(data encoding)).
next if data.empty? each_with_index do |(input, encoding), index|
input, encoding = data.split(/\n#encoding\s+/, 2)
encoding = encoding.split[0]
define_method 'test_%s_%d' % [ test_name, index + 1 ] do define_method 'test_%s_%d' % [ test_name, index + 1 ] do
stream = HTML5lib::HTMLInputStream.new(input, :chardet => false) stream = HTML5::HTMLInputStream.new(input, :chardet => false)
assert_equal encoding.downcase, stream.char_encoding.downcase, input assert_equal encoding.downcase, stream.char_encoding.downcase, input
end end
end end

View file

@ -1,23 +1,23 @@
require File.join(File.dirname(__FILE__), 'preamble') require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/liberalxmlparser' require 'html5/liberalxmlparser'
XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/ XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/
SORTATTRS = '<#{$1+$2.split.sort.join(' ')+$3}>'
def assert_xml_equal(input, expected=nil, parser=HTML5lib::XMLParser) def assert_xml_equal(input, expected=nil, parser=HTML5::XMLParser)
sortattrs = proc {"<#{$1+$2.split.sort.join(' ')+$3}>"}
document = parser.parse(input.chomp).root document = parser.parse(input.chomp).root
if not expected if not expected
expected = input.chomp.gsub(XMLELEM,SORTATTRS) expected = input.chomp.gsub(XMLELEM,&sortattrs)
expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')} expected = expected.gsub(/&#(\d+);/) {[$1.to_i].pack('U')}
output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,SORTATTRS) output = document.to_s.gsub(/'/,'"').gsub(XMLELEM,&sortattrs)
assert_equal(expected, output) assert_equal(expected, output)
else else
assert_equal(expected, document.to_s.gsub(/'/,'"')) assert_equal(expected, document.to_s.gsub(/'/,'"'))
end end
end end
def assert_xhtml_equal(input, expected=nil, parser=HTML5lib::XHTMLParser) def assert_xhtml_equal(input, expected=nil, parser=HTML5::XHTMLParser)
assert_xml_equal(input, expected, parser) assert_xml_equal(input, expected, parser)
end end
@ -34,10 +34,10 @@ class BasicXhtml5Test < Test::Unit::TestCase
def test_title_body_named_charref def test_title_body_named_charref
assert_xhtml_equal( assert_xhtml_equal(
'<title>mdash</title>A &mdash B', '<title>ntilde</title>A &ntilde B',
'<html xmlns="http://www.w3.org/1999/xhtml">' + '<html xmlns="http://www.w3.org/1999/xhtml">' +
'<head><title>mdash</title></head>' + '<head><title>ntilde</title></head>' +
'<body>A '+ [0x2014].pack('U') + ' B</body>' + '<body>A '+ [0xF1].pack('U') + ' B</body>' +
'</html>') '</html>')
end end
end end
@ -193,20 +193,71 @@ EOX
def test_br def test_br
assert_xhtml_equal <<EOX1 assert_xhtml_equal <<EOX1
<html xmlns="http://www.w3.org/1999/xhtml"> <html xmlns="http://www.w3.org/1999/xhtml">
<head><title>XLINK</title></head> <head><title>BR</title></head>
<body> <body>
<br/> <br/>
</body></html> </body></html>
EOX1 EOX1
end end
def xtest_strong def test_strong
assert_xhtml_equal <<EOX assert_xhtml_equal <<EOX
<html xmlns="http://www.w3.org/1999/xhtml"> <html xmlns="http://www.w3.org/1999/xhtml">
<head><title>XLINK</title></head> <head><title>STRONG</title></head>
<body> <body>
<strong></strong> <strong></strong>
</body></html> </body></html>
EOX EOX
end end
# A script element whose text contains the escaped characters "&lt;" and
# "&amp;" is handed to assert_xhtml_equal with no explicit expected value.
def test_script
  markup = <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>SCRIPT</title></head>
<body>
<script>1 &lt; 2 &amp; 3</script>
</body></html>
EOX
  assert_xhtml_equal(markup)
end
# A self-closing <script src="..."/> in the head is expected to come back
# with an explicit </script> end tag; the body script keeps its escaped text.
# The expected document is passed with surrounding whitespace stripped.
def test_script_src
  source = <<EOX1
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>SCRIPT</title><script src="http://example.com"/></head>
<body>
<script>1 &lt; 2 &amp; 3</script>
</body></html>
EOX1
  wanted = <<EOX2.strip
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>SCRIPT</title><script src="http://example.com"></script></head>
<body>
<script>1 &lt; 2 &amp; 3</script>
</body></html>
EOX2
  assert_xhtml_equal(source, wanted)
end
# A title whose text contains the escaped characters "&lt;" and "&amp;" is
# handed to assert_xhtml_equal with no explicit expected value.
def test_title
  markup = <<EOX
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>1 &lt; 2 &amp; 3</title></head>
<body>
</body></html>
EOX
  assert_xhtml_equal(markup)
end
# A document that starts with an XML declaration is expected to produce the
# same document without that declaration. The expected document is passed
# with surrounding whitespace stripped.
def test_prolog
  source = <<EOX1
<?xml version="1.0" encoding="UTF-8" ?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>PROLOG</title></head>
<body>
</body></html>
EOX1
  wanted = <<EOX2.strip
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>PROLOG</title></head>
<body>
</body></html>
EOX2
  assert_xhtml_equal(source, wanted)
end
end end

View file

@ -1,7 +1,7 @@
require File.join(File.dirname(__FILE__), 'preamble') require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/treebuilders' require 'html5/treebuilders'
require 'html5lib/html5parser' require 'html5/html5parser'
$tree_types_to_test = ['simpletree', 'rexml'] $tree_types_to_test = ['simpletree', 'rexml']
@ -18,18 +18,17 @@ puts 'Testing tree builders: ' + $tree_types_to_test * ', '
class Html5ParserTestCase < Test::Unit::TestCase class Html5ParserTestCase < Test::Unit::TestCase
include HTML5lib include HTML5
include TestSupport include TestSupport
html5lib_test_files('tree-construction').each do |test_file| html5_test_files('tree-construction').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '') test_name = File.basename(test_file).sub('.dat', '')
File.read(test_file).split("#data\n").each_with_index do |data, index| TestData.new(test_file, %w(data errors document-fragment document)).
next if data.empty? each_with_index do |(input, errors, innerHTML, expected), index|
innerHTML, input, expected_output, expected_errors = expected = expected.gsub("\n| ","\n")[2..-1]
TestSupport.parseTestcase(data)
$tree_types_to_test.each do |tree_name| $tree_types_to_test.each do |tree_name|
define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
@ -44,9 +43,9 @@ class Html5ParserTestCase < Test::Unit::TestCase
actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document)) actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
assert_equal sortattrs(expected_output), sortattrs(actual_output), [ assert_equal sortattrs(expected), sortattrs(actual_output), [
'', 'Input:', input, '', 'Input:', input,
'', 'Expected:', expected_output, '', 'Expected:', expected,
'', 'Recieved:', actual_output '', 'Recieved:', actual_output
].join("\n") ].join("\n")
@ -54,9 +53,9 @@ class Html5ParserTestCase < Test::Unit::TestCase
actual_errors = parser.errors.map do |(line, col), message| actual_errors = parser.errors.map do |(line, col), message|
'Line: %i Col: %i %s' % [line, col, message] 'Line: %i Col: %i %s' % [line, col, message]
end end
assert_equal expected_errors.length, parser.errors.length, [ assert_equal errors.length, parser.errors.length, [
'Input', input + "\n", 'Input', input + "\n",
'Expected errors:', expected_errors.join("\n"), 'Expected errors:', errors.join("\n"),
'Actual errors:', actual_errors.join("\n") 'Actual errors:', actual_errors.join("\n")
].join("\n") ].join("\n")
end end

View file

@ -2,14 +2,14 @@
require File.join(File.dirname(__FILE__), 'preamble') require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser' require 'html5/html5parser'
require 'html5lib/liberalxmlparser' require 'html5/liberalxmlparser'
require 'html5lib/treewalkers' require 'html5/treewalkers'
require 'html5lib/serializer' require 'html5/serializer'
require 'html5lib/sanitizer' require 'html5/sanitizer'
class SanitizeTest < Test::Unit::TestCase class SanitizeTest < Test::Unit::TestCase
include HTML5lib include HTML5
def sanitize_xhtml stream def sanitize_xhtml stream
XHTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).to_s XHTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).to_s
@ -131,7 +131,7 @@ class SanitizeTest < Test::Unit::TestCase
# check_sanitization(input, output, output, output) # check_sanitization(input, output, output, output)
# end # end
html5lib_test_files('sanitizer').each do |filename| html5_test_files('sanitizer').each do |filename|
JSON::parse(open(filename).read).each do |test| JSON::parse(open(filename).read).each do |test|
define_method "test_#{test['name']}" do define_method "test_#{test['name']}" do
check_sanitization( check_sanitization(

View file

@ -1,13 +1,13 @@
require File.join(File.dirname(__FILE__), 'preamble') require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser' require 'html5/html5parser'
require 'html5lib/serializer' require 'html5/serializer'
require 'html5lib/treewalkers' require 'html5/treewalkers'
#Run the serialize error checks #Run the serialize error checks
checkSerializeErrors = false checkSerializeErrors = false
class JsonWalker < HTML5lib::TreeWalkers::Base class JsonWalker < HTML5::TreeWalkers::Base
def each def each
@tree.each do |token| @tree.each do |token|
case token[0] case token[0]
@ -31,7 +31,7 @@ class JsonWalker < HTML5lib::TreeWalkers::Base
end end
class Html5SerializeTestcase < Test::Unit::TestCase class Html5SerializeTestcase < Test::Unit::TestCase
html5lib_test_files('serializer').each do |filename| html5_test_files('serializer').each do |filename|
test_name = File.basename(filename).sub('.test', '') test_name = File.basename(filename).sub('.test', '')
tests = JSON::parse(open(filename).read) tests = JSON::parse(open(filename).read)
tests['tests'].each_with_index do |test, index| tests['tests'].each_with_index do |test, index|
@ -41,7 +41,7 @@ class Html5SerializeTestcase < Test::Unit::TestCase
test["options"][:encoding] = test["options"]["encoding"] test["options"][:encoding] = test["options"]["encoding"]
end end
result = HTML5lib::HTMLSerializer. result = HTML5::HTMLSerializer.
serialize(JsonWalker.new(test["input"]), (test["options"] || {})) serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
expected = test["expected"] expected = test["expected"]
if expected.length == 1 if expected.length == 1
@ -52,7 +52,7 @@ class Html5SerializeTestcase < Test::Unit::TestCase
return if test_name == 'optionaltags' return if test_name == 'optionaltags'
result = HTML5lib::XHTMLSerializer. result = HTML5::XHTMLSerializer.
serialize(JsonWalker.new(test["input"]), (test["options"] || {})) serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
expected = test["xhtml"] || test["expected"] expected = test["xhtml"] || test["expected"]
if expected.length == 1 if expected.length == 1

View file

@ -1,9 +1,9 @@
require File.join(File.dirname(__FILE__), 'preamble') require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/inputstream' require 'html5/inputstream'
class HTMLInputStreamTest < Test::Unit::TestCase class HTMLInputStreamTest < Test::Unit::TestCase
include HTML5lib include HTML5
def test_char_ascii def test_char_ascii
stream = HTMLInputStream.new("'", :encoding=>'ascii') stream = HTMLInputStream.new("'", :encoding=>'ascii')

View file

@ -1,6 +1,6 @@
require File.join(File.dirname(__FILE__), 'preamble') require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/tokenizer' require 'html5/tokenizer'
require 'tokenizer_test_parser' require 'tokenizer_test_parser'
@ -36,7 +36,7 @@ class Html5TokenizerTestCase < Test::Unit::TestCase
'' ] * "\n" '' ] * "\n"
assert_nothing_raised message do assert_nothing_raised message do
tokenizer = HTML5lib::HTMLTokenizer.new(data['input']) tokenizer = HTML5::HTMLTokenizer.new(data['input'])
tokenizer.contentModelFlag = content_model_flag.to_sym tokenizer.contentModelFlag = content_model_flag.to_sym
@ -53,7 +53,7 @@ class Html5TokenizerTestCase < Test::Unit::TestCase
end end
end end
html5lib_test_files('tokenizer').each do |test_file| html5_test_files('tokenizer').each do |test_file|
test_name = File.basename(test_file).sub('.test', '') test_name = File.basename(test_file).sub('.test', '')
tests = JSON.parse(File.read(test_file))['tests'] tests = JSON.parse(File.read(test_file))['tests']

View file

@ -1,25 +1,25 @@
require File.join(File.dirname(__FILE__), 'preamble') require File.join(File.dirname(__FILE__), 'preamble')
require 'html5lib/html5parser' require 'html5/html5parser'
require 'html5lib/treewalkers' require 'html5/treewalkers'
require 'html5lib/treebuilders' require 'html5/treebuilders'
$tree_types_to_test = { $tree_types_to_test = {
'simpletree' => 'simpletree' =>
{:builder => HTML5lib::TreeBuilders['simpletree'], {:builder => HTML5::TreeBuilders['simpletree'],
:walker => HTML5lib::TreeWalkers['simpletree']}, :walker => HTML5::TreeWalkers['simpletree']},
'rexml' => 'rexml' =>
{:builder => HTML5lib::TreeBuilders['rexml'], {:builder => HTML5::TreeBuilders['rexml'],
:walker => HTML5lib::TreeWalkers['rexml']}, :walker => HTML5::TreeWalkers['rexml']},
'hpricot' => 'hpricot' =>
{:builder => HTML5lib::TreeBuilders['hpricot'], {:builder => HTML5::TreeBuilders['hpricot'],
:walker => HTML5lib::TreeWalkers['hpricot']}, :walker => HTML5::TreeWalkers['hpricot']},
} }
puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', ' puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', '
class TestTreeWalkers < Test::Unit::TestCase class TestTreeWalkers < Test::Unit::TestCase
include HTML5lib::TestSupport include HTML5::TestSupport
def concatenateCharacterTokens(tokens) def concatenateCharacterTokens(tokens)
charactersToken = nil charactersToken = nil
@ -70,22 +70,21 @@ class TestTreeWalkers < Test::Unit::TestCase
return output.join("\n") return output.join("\n")
end end
html5lib_test_files('tree-construction').each do |test_file| html5_test_files('tree-construction').each do |test_file|
test_name = File.basename(test_file).sub('.dat', '') test_name = File.basename(test_file).sub('.dat', '')
next if test_name == 'tests5' # TODO next if test_name == 'tests5' # TODO
File.read(test_file).split("#data\n").each_with_index do |data, index| TestData.new(test_file, %w(data errors document-fragment document)).
next if data.empty? each_with_index do |(input, errors, innerHTML, expected), index|
innerHTML, input, expected_output, expected_errors = expected = expected.gsub("\n| ","\n")[2..-1]
HTML5lib::TestSupport::parseTestcase(data)
$tree_types_to_test.each do |tree_name, tree_class| $tree_types_to_test.each do |tree_name, tree_class|
define_method "test_#{test_name}_#{index}_#{tree_name}" do define_method "test_#{test_name}_#{index}_#{tree_name}" do
parser = HTML5lib::HTMLParser.new(:tree => tree_class[:builder]) parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])
if innerHTML if innerHTML
parser.parseFragment(input, innerHTML) parser.parseFragment(input, innerHTML)
@ -97,7 +96,7 @@ class TestTreeWalkers < Test::Unit::TestCase
begin begin
output = sortattrs(convertTokens(tree_class[:walker].new(document))) output = sortattrs(convertTokens(tree_class[:walker].new(document)))
expected = sortattrs(expected_output) expected = sortattrs(expected)
assert_equal expected, output, [ assert_equal expected, output, [
'', 'Input:', input, '', 'Input:', input,
'', 'Expected:', expected, '', 'Expected:', expected,

View file

@ -1,4 +1,4 @@
require 'html5lib/constants' require 'html5/constants'
class TokenizerTestParser class TokenizerTestParser
def initialize(tokenizer) def initialize(tokenizer)
@ -27,7 +27,7 @@ class TokenizerTestParser
end end
def processEmptyTag(token) def processEmptyTag(token)
if not HTML5lib::VOID_ELEMENTS.include? token[:name] if not HTML5::VOID_ELEMENTS.include? token[:name]
@outputTokens.push("ParseError") @outputTokens.push("ParseError")
end end
@outputTokens.push(["StartTag", token[:name], token[:data]]) @outputTokens.push(["StartTag", token[:name], token[:data]])

View file

@ -28,6 +28,7 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
class LineSource class LineSource
include MaRuKu::Strings include MaRuKu::Strings
attr_reader :parent
def initialize(lines, parent=nil, parent_offset=nil) def initialize(lines, parent=nil, parent_offset=nil)
raise "NIL lines? " if not lines raise "NIL lines? " if not lines

View file

@ -65,22 +65,8 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
when :ald when :ald
output.push read_ald(src) output.push read_ald(src)
when :text when :text
if src.cur_line =~ MightBeTableHeader and # paragraph, or table, or definition list
(src.next_line && src.next_line =~ TableSeparator) read_text_material(src, output)
output.push read_table(src)
elsif [:header1,:header2].include? src.next_line.md_type
output.push read_header12(src)
elsif eventually_comes_a_def_list(src)
definition = read_definition(src)
if output.last.kind_of?(MDElement) &&
output.last.node_type == :definition_list then
output.last.children << definition
else
output.push md_el(:definition_list, [definition])
end
else # Start of a paragraph
output.push read_paragraph(src)
end
when :header2, :hrule when :header2, :hrule
# hrule # hrule
src.shift_line src.shift_line
@ -102,7 +88,12 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
when :raw_html; e = read_raw_html(src); output << e if e when :raw_html; e = read_raw_html(src); output << e if e
when :footnote_text; output.push read_footnote_text(src) when :footnote_text; output.push read_footnote_text(src)
when :ref_definition; read_ref_definition(src, output) when :ref_definition;
if src.parent && (src.cur_index == 0)
read_text_material(src, output)
else
read_ref_definition(src, output)
end
when :abbreviation; output.push read_abbreviation(src) when :abbreviation; output.push read_abbreviation(src)
when :xml_instr; read_xml_instruction(src, output) when :xml_instr; read_xml_instruction(src, output)
when :metadata; when :metadata;
@ -149,6 +140,24 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
output output
end end
# Handles a block that starts with a text line by choosing between the
# table, header, definition and paragraph readers, in that order.
#
# src    - line source; its cur_line and next_line are examined here and
#          it is then handed to the selected read_* helper.
# output - collection of parsed elements; the new element is pushed onto
#          it, except that a definition is appended to the children of the
#          last element when that element is already a :definition_list.
def read_text_material(src, output)
  # A possible table header only counts when a separator line follows it.
  if src.cur_line =~ MightBeTableHeader &&
     src.next_line && src.next_line =~ TableSeparator
    return output.push(read_table(src))
  end

  # The following line is typed as :header1 or :header2.
  if [:header1, :header2].include?(src.next_line.md_type)
    return output.push(read_header12(src))
  end

  # Anything that does not lead to a definition list starts a paragraph.
  return output.push(read_paragraph(src)) unless eventually_comes_a_def_list(src)

  definition = read_definition(src)
  previous = output.last
  if previous.kind_of?(MDElement) && previous.node_type == :definition_list
    # Consecutive definitions are collected in one list element.
    previous.children << definition
  else
    output.push md_el(:definition_list, [definition])
  end
end
def read_ald(src) def read_ald(src)
@ -274,9 +283,9 @@ module MaRuKu; module In; module Markdown; module BlockLevelParser
item_type = src.cur_line.md_type item_type = src.cur_line.md_type
first = src.shift_line first = src.shift_line
# Ugly things going on inside `read_indented_content`
indentation = spaces_before_first_char(first) indentation = spaces_before_first_char(first)
break_list = [:ulist, :olist, :ial] break_list = [:ulist, :olist, :ial]
# Ugly things going on inside `read_indented_content`
lines, want_my_paragraph = lines, want_my_paragraph =
read_indented_content(src,indentation, break_list, item_type) read_indented_content(src,indentation, break_list, item_type)