0f6889e09f
Fix Diego Restrepo's bug (see Rev 184). Update to latest HTML5lib.
1047 lines
30 KiB
Ruby
Executable file
1047 lines
30 KiB
Ruby
Executable file
# Constant tables for the HTML5 library: element-name groups, character
# classes, entity tables, known encoding labels and parse-error messages.
module HTML5

  # Exception class named for end-of-file conditions.
  # It is kept as a direct subclass of Exception so that existing
  # `rescue EOF` clauses behave as before; note that a bare `rescue`
  # (StandardError) does not catch it.
  class EOF < Exception; end

  # Pass-through wrapper applied to every message in E; returns +str+
  # unchanged.
  # NOTE(review): looks like a gettext-style translation hook -- confirm.
  def self._(str)
    str
  end

  # Names of the content model flags.
  CONTENT_MODEL_FLAGS = [
    :PCDATA,
    :RCDATA,
    :CDATA,
    :PLAINTEXT
  ]

  # Element names grouped as "scoping".
  SCOPING_ELEMENTS = %w[
    button
    caption
    html
    marquee
    object
    table
    td
    th
  ]

  # Element names grouped as "formatting".
  FORMATTING_ELEMENTS = %w[
    a
    b
    big
    em
    font
    i
    nobr
    s
    small
    strike
    strong
    tt
    u
  ]

  # Element names grouped as "special".
  SPECIAL_ELEMENTS = %w[
    address
    area
    base
    basefont
    bgsound
    blockquote
    body
    br
    center
    col
    colgroup
    dd
    dir
    div
    dl
    dt
    embed
    fieldset
    form
    frame
    frameset
    h1
    h2
    h3
    h4
    h5
    h6
    head
    hr
    iframe
    image
    img
    input
    isindex
    li
    link
    listing
    menu
    meta
    noembed
    noframes
    noscript
    ol
    optgroup
    option
    p
    param
    plaintext
    pre
    script
    select
    spacer
    style
    tbody
    textarea
    tfoot
    thead
    title
    tr
    ul
    wbr
  ]

  # Whitespace characters as one-character strings. %W (capital W) is
  # required so that the backslash escapes are interpreted.
  SPACE_CHARACTERS = %W[
    \t
    \n
    \x0B
    \x0C
    \x20
    \r
  ]

  # Table-related element names.
  # NOTE(review): presumably the elements that trigger table insert mode.
  TABLE_INSERT_MODE_ELEMENTS = %w[
    table
    tbody
    tfoot
    thead
    tr
  ]

  # Character classes. The ASCII_* constants are Strings, DIGITS is a Range
  # of one-character strings and HEX_DIGITS is an Array of one-character
  # strings; the types are kept as they were for existing callers.
  ASCII_LOWERCASE = ('a'..'z').to_a.join('')
  ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
  ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
  DIGITS = '0'..'9'
  HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a

  # Heading elements need to be ordered
  HEADING_ELEMENTS = %w[
    h1
    h2
    h3
    h4
    h5
    h6
  ]

  # XXX What about event-source and command?
  VOID_ELEMENTS = %w[
    base
    link
    meta
    hr
    br
    img
    embed
    param
    area
    col
    input
  ]

  # Element names grouped under the CDATA label.
  CDATA_ELEMENTS = %w[title textarea]

  # Element names grouped under the RCDATA label.
  RCDATA_ELEMENTS = %w[
    style
    script
    xmp
    iframe
    noembed
    noframes
    noscript
  ]

  # Boolean attribute names keyed by element name (String). The :global
  # (Symbol) key holds attributes that are not tied to a single element.
  BOOLEAN_ATTRIBUTES = {
    :global => %w[irrelevant],
    'style' => %w[scoped],
    'img' => %w[ismap],
    'audio' => %w[autoplay controls],
    'video' => %w[autoplay controls],
    'script' => %w[defer async],
    'details' => %w[open],
    'datagrid' => %w[multiple disabled],
    'command' => %w[hidden disabled checked default],
    'menu' => %w[autosubmit],
    'fieldset' => %w[disabled readonly],
    'option' => %w[disabled readonly selected],
    'optgroup' => %w[disabled readonly],
    'button' => %w[disabled autofocus],
    'input' => %w[disabled readonly required autofocus checked ismap],
    'select' => %w[disabled readonly autofocus multiple],
    'output' => %w[disabled readonly]
  }

  # entitiesWindows1252 has to be _ordered_ and needs to have an index.
  # Index i holds the code point for byte 0x80 + i (see the per-line
  # comments); 65533 (U+FFFD) marks bytes with no assigned character.
  ENTITIES_WINDOWS1252 = [
    8364,  # 0x80  0x20AC  EURO SIGN
    65533, # 0x81          UNDEFINED
    8218,  # 0x82  0x201A  SINGLE LOW-9 QUOTATION MARK
    402,   # 0x83  0x0192  LATIN SMALL LETTER F WITH HOOK
    8222,  # 0x84  0x201E  DOUBLE LOW-9 QUOTATION MARK
    8230,  # 0x85  0x2026  HORIZONTAL ELLIPSIS
    8224,  # 0x86  0x2020  DAGGER
    8225,  # 0x87  0x2021  DOUBLE DAGGER
    710,   # 0x88  0x02C6  MODIFIER LETTER CIRCUMFLEX ACCENT
    8240,  # 0x89  0x2030  PER MILLE SIGN
    352,   # 0x8A  0x0160  LATIN CAPITAL LETTER S WITH CARON
    8249,  # 0x8B  0x2039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    338,   # 0x8C  0x0152  LATIN CAPITAL LIGATURE OE
    65533, # 0x8D          UNDEFINED
    381,   # 0x8E  0x017D  LATIN CAPITAL LETTER Z WITH CARON
    65533, # 0x8F          UNDEFINED
    65533, # 0x90          UNDEFINED
    8216,  # 0x91  0x2018  LEFT SINGLE QUOTATION MARK
    8217,  # 0x92  0x2019  RIGHT SINGLE QUOTATION MARK
    8220,  # 0x93  0x201C  LEFT DOUBLE QUOTATION MARK
    8221,  # 0x94  0x201D  RIGHT DOUBLE QUOTATION MARK
    8226,  # 0x95  0x2022  BULLET
    8211,  # 0x96  0x2013  EN DASH
    8212,  # 0x97  0x2014  EM DASH
    732,   # 0x98  0x02DC  SMALL TILDE
    8482,  # 0x99  0x2122  TRADE MARK SIGN
    353,   # 0x9A  0x0161  LATIN SMALL LETTER S WITH CARON
    8250,  # 0x9B  0x203A  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    339,   # 0x9C  0x0153  LATIN SMALL LIGATURE OE
    65533, # 0x9D          UNDEFINED
    382,   # 0x9E  0x017E  LATIN SMALL LETTER Z WITH CARON
    376    # 0x9F  0x0178  LATIN CAPITAL LETTER Y WITH DIAERESIS
  ]

  # ENTITIES was generated from Python using the following code:
  #
  #   import constants
  #   entities = constants.entities.items()
  #   entities.sort()
  #   list = [ '    '.join([repr(entity), '=>', ord(value)<128 and
  #     repr(str(value)) or repr(value.encode('utf-8')).replace("'",'"')])
  #     for entity, value in entities]
  #   print '  ENTITIES = {\n    ' + ',\n    '.join(list) + '\n  }'
  #
  # Keys are entity names; names ending in ';' and names without it are
  # separate keys. Values are the character as a UTF-8 encoded string.
  ENTITIES = {
    'AElig' => "\xc3\x86",
    'AElig;' => "\xc3\x86",
    'AMP' => '&',
    'AMP;' => '&',
    'Aacute' => "\xc3\x81",
    'Aacute;' => "\xc3\x81",
    'Acirc' => "\xc3\x82",
    'Acirc;' => "\xc3\x82",
    'Agrave' => "\xc3\x80",
    'Agrave;' => "\xc3\x80",
    'Alpha;' => "\xce\x91",
    'Aring' => "\xc3\x85",
    'Aring;' => "\xc3\x85",
    'Atilde' => "\xc3\x83",
    'Atilde;' => "\xc3\x83",
    'Auml' => "\xc3\x84",
    'Auml;' => "\xc3\x84",
    'Beta;' => "\xce\x92",
    'COPY' => "\xc2\xa9",
    'COPY;' => "\xc2\xa9",
    'Ccedil' => "\xc3\x87",
    'Ccedil;' => "\xc3\x87",
    'Chi;' => "\xce\xa7",
    'Dagger;' => "\xe2\x80\xa1",
    'Delta;' => "\xce\x94",
    'ETH' => "\xc3\x90",
    'ETH;' => "\xc3\x90",
    'Eacute' => "\xc3\x89",
    'Eacute;' => "\xc3\x89",
    'Ecirc' => "\xc3\x8a",
    'Ecirc;' => "\xc3\x8a",
    'Egrave' => "\xc3\x88",
    'Egrave;' => "\xc3\x88",
    'Epsilon;' => "\xce\x95",
    'Eta;' => "\xce\x97",
    'Euml' => "\xc3\x8b",
    'Euml;' => "\xc3\x8b",
    'GT' => '>',
    'GT;' => '>',
    'Gamma;' => "\xce\x93",
    'Iacute' => "\xc3\x8d",
    'Iacute;' => "\xc3\x8d",
    'Icirc' => "\xc3\x8e",
    'Icirc;' => "\xc3\x8e",
    'Igrave' => "\xc3\x8c",
    'Igrave;' => "\xc3\x8c",
    'Iota;' => "\xce\x99",
    'Iuml' => "\xc3\x8f",
    'Iuml;' => "\xc3\x8f",
    'Kappa;' => "\xce\x9a",
    'LT' => '<',
    'LT;' => '<',
    'Lambda;' => "\xce\x9b",
    'Mu;' => "\xce\x9c",
    'Ntilde' => "\xc3\x91",
    'Ntilde;' => "\xc3\x91",
    'Nu;' => "\xce\x9d",
    'OElig;' => "\xc5\x92",
    'Oacute' => "\xc3\x93",
    'Oacute;' => "\xc3\x93",
    'Ocirc' => "\xc3\x94",
    'Ocirc;' => "\xc3\x94",
    'Ograve' => "\xc3\x92",
    'Ograve;' => "\xc3\x92",
    'Omega;' => "\xce\xa9",
    'Omicron;' => "\xce\x9f",
    'Oslash' => "\xc3\x98",
    'Oslash;' => "\xc3\x98",
    'Otilde' => "\xc3\x95",
    'Otilde;' => "\xc3\x95",
    'Ouml' => "\xc3\x96",
    'Ouml;' => "\xc3\x96",
    'Phi;' => "\xce\xa6",
    'Pi;' => "\xce\xa0",
    'Prime;' => "\xe2\x80\xb3",
    'Psi;' => "\xce\xa8",
    'QUOT' => '"',
    'QUOT;' => '"',
    'REG' => "\xc2\xae",
    'REG;' => "\xc2\xae",
    'Rho;' => "\xce\xa1",
    'Scaron;' => "\xc5\xa0",
    'Sigma;' => "\xce\xa3",
    'THORN' => "\xc3\x9e",
    'THORN;' => "\xc3\x9e",
    'TRADE;' => "\xe2\x84\xa2",
    'Tau;' => "\xce\xa4",
    'Theta;' => "\xce\x98",
    'Uacute' => "\xc3\x9a",
    'Uacute;' => "\xc3\x9a",
    'Ucirc' => "\xc3\x9b",
    'Ucirc;' => "\xc3\x9b",
    'Ugrave' => "\xc3\x99",
    'Ugrave;' => "\xc3\x99",
    'Upsilon;' => "\xce\xa5",
    'Uuml' => "\xc3\x9c",
    'Uuml;' => "\xc3\x9c",
    'Xi;' => "\xce\x9e",
    'Yacute' => "\xc3\x9d",
    'Yacute;' => "\xc3\x9d",
    'Yuml;' => "\xc5\xb8",
    'Zeta;' => "\xce\x96",
    'aacute' => "\xc3\xa1",
    'aacute;' => "\xc3\xa1",
    'acirc' => "\xc3\xa2",
    'acirc;' => "\xc3\xa2",
    'acute' => "\xc2\xb4",
    'acute;' => "\xc2\xb4",
    'aelig' => "\xc3\xa6",
    'aelig;' => "\xc3\xa6",
    'agrave' => "\xc3\xa0",
    'agrave;' => "\xc3\xa0",
    'alefsym;' => "\xe2\x84\xb5",
    'alpha;' => "\xce\xb1",
    'amp' => '&',
    'amp;' => '&',
    'and;' => "\xe2\x88\xa7",
    'ang;' => "\xe2\x88\xa0",
    'apos;' => "'",
    'aring' => "\xc3\xa5",
    'aring;' => "\xc3\xa5",
    'asymp;' => "\xe2\x89\x88",
    'atilde' => "\xc3\xa3",
    'atilde;' => "\xc3\xa3",
    'auml' => "\xc3\xa4",
    'auml;' => "\xc3\xa4",
    'bdquo;' => "\xe2\x80\x9e",
    'beta;' => "\xce\xb2",
    'brvbar' => "\xc2\xa6",
    'brvbar;' => "\xc2\xa6",
    'bull;' => "\xe2\x80\xa2",
    'cap;' => "\xe2\x88\xa9",
    'ccedil' => "\xc3\xa7",
    'ccedil;' => "\xc3\xa7",
    'cedil' => "\xc2\xb8",
    'cedil;' => "\xc2\xb8",
    'cent' => "\xc2\xa2",
    'cent;' => "\xc2\xa2",
    'chi;' => "\xcf\x87",
    'circ;' => "\xcb\x86",
    'clubs;' => "\xe2\x99\xa3",
    'cong;' => "\xe2\x89\x85",
    'copy' => "\xc2\xa9",
    'copy;' => "\xc2\xa9",
    'crarr;' => "\xe2\x86\xb5",
    'cup;' => "\xe2\x88\xaa",
    'curren' => "\xc2\xa4",
    'curren;' => "\xc2\xa4",
    'dArr;' => "\xe2\x87\x93",
    'dagger;' => "\xe2\x80\xa0",
    'darr;' => "\xe2\x86\x93",
    'deg' => "\xc2\xb0",
    'deg;' => "\xc2\xb0",
    'delta;' => "\xce\xb4",
    'diams;' => "\xe2\x99\xa6",
    'divide' => "\xc3\xb7",
    'divide;' => "\xc3\xb7",
    'eacute' => "\xc3\xa9",
    'eacute;' => "\xc3\xa9",
    'ecirc' => "\xc3\xaa",
    'ecirc;' => "\xc3\xaa",
    'egrave' => "\xc3\xa8",
    'egrave;' => "\xc3\xa8",
    'empty;' => "\xe2\x88\x85",
    'emsp;' => "\xe2\x80\x83",
    'ensp;' => "\xe2\x80\x82",
    'epsilon;' => "\xce\xb5",
    'equiv;' => "\xe2\x89\xa1",
    'eta;' => "\xce\xb7",
    'eth' => "\xc3\xb0",
    'eth;' => "\xc3\xb0",
    'euml' => "\xc3\xab",
    'euml;' => "\xc3\xab",
    'euro;' => "\xe2\x82\xac",
    'exist;' => "\xe2\x88\x83",
    'fnof;' => "\xc6\x92",
    'forall;' => "\xe2\x88\x80",
    'frac12' => "\xc2\xbd",
    'frac12;' => "\xc2\xbd",
    'frac14' => "\xc2\xbc",
    'frac14;' => "\xc2\xbc",
    'frac34' => "\xc2\xbe",
    'frac34;' => "\xc2\xbe",
    'frasl;' => "\xe2\x81\x84",
    'gamma;' => "\xce\xb3",
    'ge;' => "\xe2\x89\xa5",
    'gt' => '>',
    'gt;' => '>',
    'hArr;' => "\xe2\x87\x94",
    'harr;' => "\xe2\x86\x94",
    'hearts;' => "\xe2\x99\xa5",
    'hellip;' => "\xe2\x80\xa6",
    'iacute' => "\xc3\xad",
    'iacute;' => "\xc3\xad",
    'icirc' => "\xc3\xae",
    'icirc;' => "\xc3\xae",
    'iexcl' => "\xc2\xa1",
    'iexcl;' => "\xc2\xa1",
    'igrave' => "\xc3\xac",
    'igrave;' => "\xc3\xac",
    'image;' => "\xe2\x84\x91",
    'infin;' => "\xe2\x88\x9e",
    'int;' => "\xe2\x88\xab",
    'iota;' => "\xce\xb9",
    'iquest' => "\xc2\xbf",
    'iquest;' => "\xc2\xbf",
    'isin;' => "\xe2\x88\x88",
    'iuml' => "\xc3\xaf",
    'iuml;' => "\xc3\xaf",
    'kappa;' => "\xce\xba",
    'lArr;' => "\xe2\x87\x90",
    'lambda;' => "\xce\xbb",
    'lang;' => "\xe3\x80\x88",
    'laquo' => "\xc2\xab",
    'laquo;' => "\xc2\xab",
    'larr;' => "\xe2\x86\x90",
    'lceil;' => "\xe2\x8c\x88",
    'ldquo;' => "\xe2\x80\x9c",
    'le;' => "\xe2\x89\xa4",
    'lfloor;' => "\xe2\x8c\x8a",
    'lowast;' => "\xe2\x88\x97",
    'loz;' => "\xe2\x97\x8a",
    'lrm;' => "\xe2\x80\x8e",
    'lsaquo;' => "\xe2\x80\xb9",
    'lsquo;' => "\xe2\x80\x98",
    'lt' => '<',
    'lt;' => '<',
    'macr' => "\xc2\xaf",
    'macr;' => "\xc2\xaf",
    'mdash;' => "\xe2\x80\x94",
    'micro' => "\xc2\xb5",
    'micro;' => "\xc2\xb5",
    'middot' => "\xc2\xb7",
    'middot;' => "\xc2\xb7",
    'minus;' => "\xe2\x88\x92",
    'mu;' => "\xce\xbc",
    'nabla;' => "\xe2\x88\x87",
    'nbsp' => "\xc2\xa0",
    'nbsp;' => "\xc2\xa0",
    'ndash;' => "\xe2\x80\x93",
    'ne;' => "\xe2\x89\xa0",
    'ni;' => "\xe2\x88\x8b",
    'not' => "\xc2\xac",
    'not;' => "\xc2\xac",
    'notin;' => "\xe2\x88\x89",
    'nsub;' => "\xe2\x8a\x84",
    'ntilde' => "\xc3\xb1",
    'ntilde;' => "\xc3\xb1",
    'nu;' => "\xce\xbd",
    'oacute' => "\xc3\xb3",
    'oacute;' => "\xc3\xb3",
    'ocirc' => "\xc3\xb4",
    'ocirc;' => "\xc3\xb4",
    'oelig;' => "\xc5\x93",
    'ograve' => "\xc3\xb2",
    'ograve;' => "\xc3\xb2",
    'oline;' => "\xe2\x80\xbe",
    'omega;' => "\xcf\x89",
    'omicron;' => "\xce\xbf",
    'oplus;' => "\xe2\x8a\x95",
    'or;' => "\xe2\x88\xa8",
    'ordf' => "\xc2\xaa",
    'ordf;' => "\xc2\xaa",
    'ordm' => "\xc2\xba",
    'ordm;' => "\xc2\xba",
    'oslash' => "\xc3\xb8",
    'oslash;' => "\xc3\xb8",
    'otilde' => "\xc3\xb5",
    'otilde;' => "\xc3\xb5",
    'otimes;' => "\xe2\x8a\x97",
    'ouml' => "\xc3\xb6",
    'ouml;' => "\xc3\xb6",
    'para' => "\xc2\xb6",
    'para;' => "\xc2\xb6",
    'part;' => "\xe2\x88\x82",
    'permil;' => "\xe2\x80\xb0",
    'perp;' => "\xe2\x8a\xa5",
    'phi;' => "\xcf\x86",
    'pi;' => "\xcf\x80",
    'piv;' => "\xcf\x96",
    'plusmn' => "\xc2\xb1",
    'plusmn;' => "\xc2\xb1",
    'pound' => "\xc2\xa3",
    'pound;' => "\xc2\xa3",
    'prime;' => "\xe2\x80\xb2",
    'prod;' => "\xe2\x88\x8f",
    'prop;' => "\xe2\x88\x9d",
    'psi;' => "\xcf\x88",
    'quot' => '"',
    'quot;' => '"',
    'rArr;' => "\xe2\x87\x92",
    'radic;' => "\xe2\x88\x9a",
    'rang;' => "\xe3\x80\x89",
    'raquo' => "\xc2\xbb",
    'raquo;' => "\xc2\xbb",
    'rarr;' => "\xe2\x86\x92",
    'rceil;' => "\xe2\x8c\x89",
    'rdquo;' => "\xe2\x80\x9d",
    'real;' => "\xe2\x84\x9c",
    'reg' => "\xc2\xae",
    'reg;' => "\xc2\xae",
    'rfloor;' => "\xe2\x8c\x8b",
    'rho;' => "\xcf\x81",
    'rlm;' => "\xe2\x80\x8f",
    'rsaquo;' => "\xe2\x80\xba",
    'rsquo;' => "\xe2\x80\x99",
    'sbquo;' => "\xe2\x80\x9a",
    'scaron;' => "\xc5\xa1",
    'sdot;' => "\xe2\x8b\x85",
    'sect' => "\xc2\xa7",
    'sect;' => "\xc2\xa7",
    'shy' => "\xc2\xad",
    'shy;' => "\xc2\xad",
    'sigma;' => "\xcf\x83",
    'sigmaf;' => "\xcf\x82",
    'sim;' => "\xe2\x88\xbc",
    'spades;' => "\xe2\x99\xa0",
    'sub;' => "\xe2\x8a\x82",
    'sube;' => "\xe2\x8a\x86",
    'sum;' => "\xe2\x88\x91",
    'sup1' => "\xc2\xb9",
    'sup1;' => "\xc2\xb9",
    'sup2' => "\xc2\xb2",
    'sup2;' => "\xc2\xb2",
    'sup3' => "\xc2\xb3",
    'sup3;' => "\xc2\xb3",
    'sup;' => "\xe2\x8a\x83",
    'supe;' => "\xe2\x8a\x87",
    'szlig' => "\xc3\x9f",
    'szlig;' => "\xc3\x9f",
    'tau;' => "\xcf\x84",
    'there4;' => "\xe2\x88\xb4",
    'theta;' => "\xce\xb8",
    'thetasym;' => "\xcf\x91",
    'thinsp;' => "\xe2\x80\x89",
    'thorn' => "\xc3\xbe",
    'thorn;' => "\xc3\xbe",
    'tilde;' => "\xcb\x9c",
    'times' => "\xc3\x97",
    'times;' => "\xc3\x97",
    'trade;' => "\xe2\x84\xa2",
    'uArr;' => "\xe2\x87\x91",
    'uacute' => "\xc3\xba",
    'uacute;' => "\xc3\xba",
    'uarr;' => "\xe2\x86\x91",
    'ucirc' => "\xc3\xbb",
    'ucirc;' => "\xc3\xbb",
    'ugrave' => "\xc3\xb9",
    'ugrave;' => "\xc3\xb9",
    'uml' => "\xc2\xa8",
    'uml;' => "\xc2\xa8",
    'upsih;' => "\xcf\x92",
    'upsilon;' => "\xcf\x85",
    'uuml' => "\xc3\xbc",
    'uuml;' => "\xc3\xbc",
    'weierp;' => "\xe2\x84\x98",
    'xi;' => "\xce\xbe",
    'yacute' => "\xc3\xbd",
    'yacute;' => "\xc3\xbd",
    'yen' => "\xc2\xa5",
    'yen;' => "\xc2\xa5",
    'yuml' => "\xc3\xbf",
    'yuml;' => "\xc3\xbf",
    'zeta;' => "\xce\xb6",
    'zwj;' => "\xe2\x80\x8d",
    'zwnj;' => "\xe2\x80\x8c"
  }

  # Character-encoding labels, all lowercase.
  ENCODINGS = %w[
    ansi_x3.4-1968
    iso-ir-6
    ansi_x3.4-1986
    iso_646.irv:1991
    ascii
    iso646-us
    us-ascii
    us
    ibm367
    cp367
    csascii
    ks_c_5601-1987
    korean
    iso-2022-kr
    csiso2022kr
    euc-kr
    iso-2022-jp
    csiso2022jp
    iso-2022-jp-2
    iso-ir-58
    chinese
    csiso58gb231280
    iso_8859-1:1987
    iso-ir-100
    iso_8859-1
    iso-8859-1
    latin1
    l1
    ibm819
    cp819
    csisolatin1
    iso_8859-2:1987
    iso-ir-101
    iso_8859-2
    iso-8859-2
    latin2
    l2
    csisolatin2
    iso_8859-3:1988
    iso-ir-109
    iso_8859-3
    iso-8859-3
    latin3
    l3
    csisolatin3
    iso_8859-4:1988
    iso-ir-110
    iso_8859-4
    iso-8859-4
    latin4
    l4
    csisolatin4
    iso_8859-6:1987
    iso-ir-127
    iso_8859-6
    iso-8859-6
    ecma-114
    asmo-708
    arabic
    csisolatinarabic
    iso_8859-7:1987
    iso-ir-126
    iso_8859-7
    iso-8859-7
    elot_928
    ecma-118
    greek
    greek8
    csisolatingreek
    iso_8859-8:1988
    iso-ir-138
    iso_8859-8
    iso-8859-8
    hebrew
    csisolatinhebrew
    iso_8859-5:1988
    iso-ir-144
    iso_8859-5
    iso-8859-5
    cyrillic
    csisolatincyrillic
    iso_8859-9:1989
    iso-ir-148
    iso_8859-9
    iso-8859-9
    latin5
    l5
    csisolatin5
    iso-8859-10
    iso-ir-157
    l6
    iso_8859-10:1992
    csisolatin6
    latin6
    hp-roman8
    roman8
    r8
    ibm037
    cp037
    csibm037
    ibm424
    cp424
    csibm424
    ibm437
    cp437
    437
    cspc8codepage437
    ibm500
    cp500
    csibm500
    ibm775
    cp775
    cspc775baltic
    ibm850
    cp850
    850
    cspc850multilingual
    ibm852
    cp852
    852
    cspcp852
    ibm855
    cp855
    855
    csibm855
    ibm857
    cp857
    857
    csibm857
    ibm860
    cp860
    860
    csibm860
    ibm861
    cp861
    861
    cp-is
    csibm861
    ibm862
    cp862
    862
    cspc862latinhebrew
    ibm863
    cp863
    863
    csibm863
    ibm864
    cp864
    csibm864
    ibm865
    cp865
    865
    csibm865
    ibm866
    cp866
    866
    csibm866
    ibm869
    cp869
    869
    cp-gr
    csibm869
    ibm1026
    cp1026
    csibm1026
    koi8-r
    cskoi8r
    koi8-u
    big5-hkscs
    ptcp154
    csptcp154
    pt154
    cp154
    utf-7
    utf-16be
    utf-16le
    utf-16
    utf-8
    iso-8859-13
    iso-8859-14
    iso-ir-199
    iso_8859-14:1998
    iso_8859-14
    latin8
    iso-celtic
    l8
    iso-8859-15
    iso_8859-15
    iso-8859-16
    iso-ir-226
    iso_8859-16:2001
    iso_8859-16
    latin10
    l10
    gbk
    cp936
    ms936
    gb18030
    shift_jis
    ms_kanji
    csshiftjis
    euc-jp
    gb2312
    big5
    csbig5
    windows-1250
    windows-1251
    windows-1252
    windows-1253
    windows-1254
    windows-1255
    windows-1256
    windows-1257
    windows-1258
    tis-620
    hz-gb-2312
  ]

  # Parse-error messages keyed by error code. Every message is passed
  # through HTML5._ when the table is built.
  # NOTE(review): the %(name) markers look like Python-style named
  # placeholders; how callers substitute them is not visible here, so they
  # are kept verbatim.
  E = {
    "null-character" =>
      _("Null character in input stream, replaced with U+FFFD."),
    "incorrectly-placed-solidus" =>
      _("Solidus (/) incorrectly placed in tag."),
    "incorrect-cr-newline-entity" =>
      _("Incorrect CR newline entity, replaced with LF."),
    "illegal-windows-1252-entity" =>
      _("Entity used with illegal number (windows-1252 reference)."),
    "cant-convert-numeric-entity" =>
      _("Numeric entity couldn't be converted to character " +
        "(codepoint U+%(charAsInt)08x)."),
    # Was "codepoint=> ": the colon in the message text had been turned
    # into a hash rocket.
    "illegal-codepoint-for-numeric-entity" =>
      _("Numeric entity represents an illegal codepoint: " +
        "U+%(charAsInt)08x."),
    "numeric-entity-without-semicolon" =>
      _("Numeric entity didn't end with ';'."),
    "expected-numeric-entity-but-got-eof" =>
      _("Numeric entity expected. Got end of file instead."),
    "expected-numeric-entity" =>
      _("Numeric entity expected but none found."),
    "named-entity-without-semicolon" =>
      _("Named entity didn't end with ';'."),
    "expected-named-entity" =>
      _("Named entity expected. Got none."),
    "attributes-in-end-tag" =>
      _("End tag contains unexpected attributes."),
    "expected-tag-name-but-got-right-bracket" =>
      _("Expected tag name. Got '>' instead."),
    "expected-tag-name-but-got-question-mark" =>
      _("Expected tag name. Got '?' instead. (HTML doesn't " +
        "support processing instructions.)"),
    "expected-tag-name" =>
      _("Expected tag name. Got something else instead"),
    "expected-closing-tag-but-got-right-bracket" =>
      _("Expected closing tag. Got '>' instead. Ignoring '</>'."),
    "expected-closing-tag-but-got-eof" =>
      _("Expected closing tag. Unexpected end of file."),
    "expected-closing-tag-but-got-char" =>
      _("Expected closing tag. Unexpected character '%(data)' found."),
    "eof-in-tag-name" =>
      _("Unexpected end of file in the tag name."),
    "expected-attribute-name-but-got-eof" =>
      _("Unexpected end of file. Expected attribute name instead."),
    "eof-in-attribute-name" =>
      _("Unexpected end of file in attribute name."),
    "duplicate-attribute" =>
      _("Dropped duplicate attribute on tag."),
    "expected-end-of-tag-name-but-got-eof" =>
      _("Unexpected end of file. Expected = or end of tag."),
    "expected-attribute-value-but-got-eof" =>
      _("Unexpected end of file. Expected attribute value."),
    "eof-in-attribute-value-double-quote" =>
      _("Unexpected end of file in attribute value (\")."),
    "eof-in-attribute-value-single-quote" =>
      _("Unexpected end of file in attribute value (')."),
    "eof-in-attribute-value-no-quotes" =>
      _("Unexpected end of file in attribute value."),
    "expected-dashes-or-doctype" =>
      _("Expected '--' or 'DOCTYPE'. Not found."),
    "incorrect-comment" =>
      _("Incorrect comment."),
    "eof-in-comment" =>
      _("Unexpected end of file in comment."),
    "eof-in-comment-end-dash" =>
      _("Unexpected end of file in comment (-)"),
    "unexpected-dash-after-double-dash-in-comment" =>
      _("Unexpected '-' after '--' found in comment."),
    "eof-in-comment-double-dash" =>
      _("Unexpected end of file in comment (--)."),
    "unexpected-char-in-comment" =>
      _("Unexpected character in comment found."),
    "need-space-after-doctype" =>
      _("No space after literal string 'DOCTYPE'."),
    "expected-doctype-name-but-got-right-bracket" =>
      _("Unexpected > character. Expected DOCTYPE name."),
    "expected-doctype-name-but-got-eof" =>
      _("Unexpected end of file. Expected DOCTYPE name."),
    "eof-in-doctype-name" =>
      _("Unexpected end of file in DOCTYPE name."),
    "eof-in-doctype" =>
      _("Unexpected end of file in DOCTYPE."),
    "expected-space-or-right-bracket-in-doctype" =>
      _("Expected space or '>'. Got '%(data)'"),
    "unexpected-end-of-doctype" =>
      _("Unexpected end of DOCTYPE."),
    "unexpected-char-in-doctype" =>
      _("Unexpected character in DOCTYPE."),
    "eof-in-bogus-doctype" =>
      _("Unexpected end of file in bogus doctype."),
    "eof-in-innerhtml" =>
      _("Unexpected EOF in inner html mode."),
    "unexpected-doctype" =>
      _("Unexpected DOCTYPE. Ignored."),
    "non-html-root" =>
      _("html needs to be the first start tag."),
    "expected-doctype-but-got-eof" =>
      _("Unexpected End of file. Expected DOCTYPE."),
    "unknown-doctype" =>
      _("Erroneous DOCTYPE."),
    "expected-doctype-but-got-chars" =>
      _("Unexpected non-space characters. Expected DOCTYPE."),
    "expected-doctype-but-got-start-tag" =>
      _("Unexpected start tag (%(name)). Expected DOCTYPE."),
    "expected-doctype-but-got-end-tag" =>
      _("Unexpected end tag (%(name)). Expected DOCTYPE."),
    "end-tag-after-implied-root" =>
      _("Unexpected end tag (%(name)) after the (implied) root element."),
    "expected-named-closing-tag-but-got-eof" =>
      _("Unexpected end of file. Expected end tag (%(name))."),
    "two-heads-are-not-better-than-one" =>
      _("Unexpected start tag head in existing head. Ignored."),
    "unexpected-end-tag" =>
      _("Unexpected end tag (%(name)). Ignored."),
    "unexpected-start-tag-out-of-my-head" =>
      _("Unexpected start tag (%(name)) that can be in head. Moved."),
    "unexpected-start-tag" =>
      _("Unexpected start tag (%(name))."),
    "missing-end-tag" =>
      _("Missing end tag (%(name))."),
    "missing-end-tags" =>
      _("Missing end tags (%(name))."),
    "unexpected-start-tag-implies-end-tag" =>
      _("Unexpected start tag (%(startName)) " +
        "implies end tag (%(endName))."),
    "unexpected-start-tag-treated-as" =>
      _("Unexpected start tag (%(originalName)). Treated as %(newName)."),
    "deprecated-tag" =>
      _("Unexpected start tag %(name). Don't use it!"),
    "unexpected-start-tag-ignored" =>
      _("Unexpected start tag %(name). Ignored."),
    "expected-one-end-tag-but-got-another" =>
      _("Unexpected end tag (%(gotName)). " +
        "Missing end tag (%(expectedName))."),
    "end-tag-too-early" =>
      _("End tag (%(name)) seen too early. Expected other end tag."),
    "end-tag-too-early-named" =>
      _("Unexpected end tag (%(gotName)). Expected end tag (%(expectedName))."),
    "end-tag-too-early-ignored" =>
      _("End tag (%(name)) seen too early. Ignored."),
    "adoption-agency-1.1" =>
      _("End tag (%(name)) violates step 1, " +
        "paragraph 1 of the adoption agency algorithm."),
    "adoption-agency-1.2" =>
      _("End tag (%(name)) violates step 1, " +
        "paragraph 2 of the adoption agency algorithm."),
    "adoption-agency-1.3" =>
      _("End tag (%(name)) violates step 1, " +
        "paragraph 3 of the adoption agency algorithm."),
    "unexpected-end-tag-treated-as" =>
      _("Unexpected end tag (%(originalName)). Treated as %(newName)."),
    "no-end-tag" =>
      _("This element (%(name)) has no end tag."),
    "unexpected-implied-end-tag-in-table" =>
      _("Unexpected implied end tag (%(name)) in the table phase."),
    "unexpected-implied-end-tag-in-table-body" =>
      _("Unexpected implied end tag (%(name)) in the table body phase."),
    "unexpected-char-implies-table-voodoo" =>
      _("Unexpected non-space characters in " +
        "table context caused voodoo mode."),
    "unexpected-start-tag-implies-table-voodoo" =>
      _("Unexpected start tag (%(name)) in " +
        "table context caused voodoo mode."),
    "unexpected-end-tag-implies-table-voodoo" =>
      _("Unexpected end tag (%(name)) in " +
        "table context caused voodoo mode."),
    "unexpected-cell-in-table-body" =>
      _("Unexpected table cell start tag (%(name)) " +
        "in the table body phase."),
    "unexpected-cell-end-tag" =>
      _("Got table cell end tag (%(name)) " +
        "while required end tags are missing."),
    "unexpected-end-tag-in-table-body" =>
      _("Unexpected end tag (%(name)) in the table body phase. Ignored."),
    "unexpected-implied-end-tag-in-table-row" =>
      _("Unexpected implied end tag (%(name)) in the table row phase."),
    "unexpected-end-tag-in-table-row" =>
      _("Unexpected end tag (%(name)) in the table row phase. Ignored."),
    "unexpected-select-in-select" =>
      _("Unexpected select start tag in the select phase " +
        "implies select start tag."),
    "unexpected-start-tag-in-select" =>
      _("Unexpected start tag token (%(name)) in the select phase. " +
        "Ignored."),
    "unexpected-end-tag-in-select" =>
      _("Unexpected end tag (%(name)) in the select phase. Ignored."),
    "unexpected-char-after-body" =>
      _("Unexpected non-space characters in the after body phase."),
    "unexpected-start-tag-after-body" =>
      _("Unexpected start tag token (%(name))" +
        " in the after body phase."),
    "unexpected-end-tag-after-body" =>
      _("Unexpected end tag token (%(name))" +
        " in the after body phase."),
    # Typo "Unepxected" corrected.
    "unexpected-char-in-frameset" =>
      _("Unexpected characters in the frameset phase. Characters ignored."),
    "unexpected-start-tag-in-frameset" =>
      _("Unexpected start tag token (%(name))" +
        " in the frameset phase. Ignored."),
    "unexpected-frameset-in-frameset-innerhtml" =>
      _("Unexpected end tag token (frameset) " +
        "in the frameset phase (innerHTML)."),
    "unexpected-end-tag-in-frameset" =>
      _("Unexpected end tag token (%(name))" +
        " in the frameset phase. Ignored."),
    "unexpected-char-after-frameset" =>
      _("Unexpected non-space characters in the " +
        "after frameset phase. Ignored."),
    "unexpected-start-tag-after-frameset" =>
      _("Unexpected start tag (%(name))" +
        " in the after frameset phase. Ignored."),
    "unexpected-end-tag-after-frameset" =>
      _("Unexpected end tag (%(name))" +
        " in the after frameset phase. Ignored."),
    "expected-eof-but-got-char" =>
      _("Unexpected non-space characters. Expected end of file."),
    "expected-eof-but-got-start-tag" =>
      _("Unexpected start tag (%(name))" +
        ". Expected end of file."),
    "expected-eof-but-got-end-tag" =>
      _("Unexpected end tag (%(name))" +
        ". Expected end of file."),
    "unexpected-end-table-in-caption" =>
      _("Unexpected end table tag in caption. Generates implied end caption."),
    "end-html-in-innerhtml" => _("Unexpected html end tag in inner html mode.")
  }

end
|