Sync with latest HTML5lib and latest Maruku
This commit is contained in:
parent
8e92e4a3ab
commit
8ccaad85a5
71 changed files with 1974 additions and 1621 deletions
|
@ -1,11 +1,11 @@
|
|||
require 'html5lib/html5parser'
|
||||
|
||||
# Top-level convenience entry points; both simply delegate to HTMLParser.
module HTML5lib
  # Parses +stream+ as a complete document.
  #
  # stream  - the HTML input, passed through unchanged to HTMLParser.parse
  # options - option hash, passed through unchanged to HTMLParser.parse
  #
  # Returns whatever HTMLParser.parse returns.
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  # Parses +stream+ as a fragment.
  #
  # This used to call HTMLParser.parse, which parsed the input as a whole
  # document and left fragment-only options unconsumed. It now delegates to
  # HTMLParser.parseFragment, as the method name promises.
  #
  # Returns whatever HTMLParser.parseFragment returns.
  def self.parseFragment(stream, options={})
    HTMLParser.parseFragment(stream, options)
  end
end
|
||||
require 'html5/html5parser'
|
||||
|
||||
# Top-level convenience entry points; both simply delegate to HTMLParser.
module HTML5
  # Parses +stream+ as a complete document.
  #
  # stream  - the HTML input, passed through unchanged to HTMLParser.parse
  # options - option hash, passed through unchanged to HTMLParser.parse
  #
  # Returns whatever HTMLParser.parse returns.
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  # Parses +stream+ as a fragment.
  #
  # This used to call HTMLParser.parse, which parsed the input as a whole
  # document and left fragment-only options unconsumed. It now delegates to
  # HTMLParser.parseFragment, as the method name promises.
  #
  # Returns whatever HTMLParser.parseFragment returns.
  def self.parseFragment(stream, options={})
    HTMLParser.parseFragment(stream, options)
  end
end
|
817
vendor/plugins/HTML5lib/lib/html5/constants.rb
vendored
Executable file
817
vendor/plugins/HTML5lib/lib/html5/constants.rb
vendored
Executable file
|
@ -0,0 +1,817 @@
|
|||
module HTML5
|
||||
|
||||
class EOF < Exception; end
|
||||
|
||||
CONTENT_MODEL_FLAGS = [
|
||||
:PCDATA,
|
||||
:RCDATA,
|
||||
:CDATA,
|
||||
:PLAINTEXT
|
||||
]
|
||||
|
||||
SCOPING_ELEMENTS = %w[
|
||||
button
|
||||
caption
|
||||
html
|
||||
marquee
|
||||
object
|
||||
table
|
||||
td
|
||||
th
|
||||
]
|
||||
|
||||
FORMATTING_ELEMENTS = %w[
|
||||
a
|
||||
b
|
||||
big
|
||||
em
|
||||
font
|
||||
i
|
||||
nobr
|
||||
s
|
||||
small
|
||||
strike
|
||||
strong
|
||||
tt
|
||||
u
|
||||
]
|
||||
|
||||
SPECIAL_ELEMENTS = %w[
|
||||
address
|
||||
area
|
||||
base
|
||||
basefont
|
||||
bgsound
|
||||
blockquote
|
||||
body
|
||||
br
|
||||
center
|
||||
col
|
||||
colgroup
|
||||
dd
|
||||
dir
|
||||
div
|
||||
dl
|
||||
dt
|
||||
embed
|
||||
fieldset
|
||||
form
|
||||
frame
|
||||
frameset
|
||||
h1
|
||||
h2
|
||||
h3
|
||||
h4
|
||||
h5
|
||||
h6
|
||||
head
|
||||
hr
|
||||
iframe
|
||||
image
|
||||
img
|
||||
input
|
||||
isindex
|
||||
li
|
||||
link
|
||||
listing
|
||||
menu
|
||||
meta
|
||||
noembed
|
||||
noframes
|
||||
noscript
|
||||
ol
|
||||
optgroup
|
||||
option
|
||||
p
|
||||
param
|
||||
plaintext
|
||||
pre
|
||||
script
|
||||
select
|
||||
spacer
|
||||
style
|
||||
tbody
|
||||
textarea
|
||||
tfoot
|
||||
thead
|
||||
title
|
||||
tr
|
||||
ul
|
||||
wbr
|
||||
]
|
||||
|
||||
SPACE_CHARACTERS = %W[
|
||||
\t
|
||||
\n
|
||||
\x0B
|
||||
\x0C
|
||||
\x20
|
||||
\r
|
||||
]
|
||||
|
||||
TABLE_INSERT_MODE_ELEMENTS = %w[
|
||||
table
|
||||
tbody
|
||||
tfoot
|
||||
thead
|
||||
tr
|
||||
]
|
||||
|
||||
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
|
||||
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
|
||||
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
|
||||
DIGITS = '0'..'9'
|
||||
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
|
||||
|
||||
# Heading elements need to be ordered
|
||||
HEADING_ELEMENTS = %w[
|
||||
h1
|
||||
h2
|
||||
h3
|
||||
h4
|
||||
h5
|
||||
h6
|
||||
]
|
||||
|
||||
# XXX What about event-source and command?
|
||||
VOID_ELEMENTS = %w[
|
||||
base
|
||||
link
|
||||
meta
|
||||
hr
|
||||
br
|
||||
img
|
||||
embed
|
||||
param
|
||||
area
|
||||
col
|
||||
input
|
||||
]
|
||||
|
||||
CDATA_ELEMENTS = %w[title textarea]
|
||||
|
||||
RCDATA_ELEMENTS = %w[
|
||||
style
|
||||
script
|
||||
xmp
|
||||
iframe
|
||||
noembed
|
||||
noframes
|
||||
noscript
|
||||
]
|
||||
|
||||
BOOLEAN_ATTRIBUTES = {
|
||||
:global => %w[irrelevant],
|
||||
'style' => %w[scoped],
|
||||
'img' => %w[ismap],
|
||||
'audio' => %w[autoplay controls],
|
||||
'video' => %w[autoplay controls],
|
||||
'script' => %w[defer async],
|
||||
'details' => %w[open],
|
||||
'datagrid' => %w[multiple disabled],
|
||||
'command' => %w[hidden disabled checked default],
|
||||
'menu' => %w[autosubmit],
|
||||
'fieldset' => %w[disabled readonly],
|
||||
'option' => %w[disabled readonly selected],
|
||||
'optgroup' => %w[disabled readonly],
|
||||
'button' => %w[disabled autofocus],
|
||||
'input' => %w[disabled readonly required autofocus checked ismap],
|
||||
'select' => %w[disabled readonly autofocus multiple],
|
||||
'output' => %w[disabled readonly]
|
||||
}
|
||||
|
||||
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
|
||||
ENTITIES_WINDOWS1252 = [
|
||||
8364, # 0x80 0x20AC EURO SIGN
|
||||
65533, # 0x81 UNDEFINED
|
||||
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
|
||||
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
|
||||
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
|
||||
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
|
||||
8224, # 0x86 0x2020 DAGGER
|
||||
8225, # 0x87 0x2021 DOUBLE DAGGER
|
||||
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||
8240, # 0x89 0x2030 PER MILLE SIGN
|
||||
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
|
||||
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
|
||||
65533, # 0x8D UNDEFINED
|
||||
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
|
||||
65533, # 0x8F UNDEFINED
|
||||
65533, # 0x90 UNDEFINED
|
||||
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
|
||||
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
|
||||
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
|
||||
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
|
||||
8226, # 0x95 0x2022 BULLET
|
||||
8211, # 0x96 0x2013 EN DASH
|
||||
8212, # 0x97 0x2014 EM DASH
|
||||
732, # 0x98 0x02DC SMALL TILDE
|
||||
8482, # 0x99 0x2122 TRADE MARK SIGN
|
||||
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
|
||||
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
|
||||
65533, # 0x9D UNDEFINED
|
||||
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
|
||||
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||
]
|
||||
|
||||
# ENTITIES was generated from Python using the following code:
|
||||
#
|
||||
# import constants
|
||||
# entities = constants.entities.items()
|
||||
# entities.sort()
|
||||
# list = [ ' '.join([repr(entity), '=>', ord(value)<128 and
|
||||
# repr(str(value)) or repr(value.encode('utf-8')).replace("'",'"')])
|
||||
# for entity, value in entities]
|
||||
# print ' ENTITIES = {\n ' + ',\n '.join(list) + '\n }'
|
||||
|
||||
ENTITIES = {
|
||||
'AElig' => "\xc3\x86",
|
||||
'AElig;' => "\xc3\x86",
|
||||
'AMP' => '&',
|
||||
'AMP;' => '&',
|
||||
'Aacute' => "\xc3\x81",
|
||||
'Aacute;' => "\xc3\x81",
|
||||
'Acirc' => "\xc3\x82",
|
||||
'Acirc;' => "\xc3\x82",
|
||||
'Agrave' => "\xc3\x80",
|
||||
'Agrave;' => "\xc3\x80",
|
||||
'Alpha;' => "\xce\x91",
|
||||
'Aring' => "\xc3\x85",
|
||||
'Aring;' => "\xc3\x85",
|
||||
'Atilde' => "\xc3\x83",
|
||||
'Atilde;' => "\xc3\x83",
|
||||
'Auml' => "\xc3\x84",
|
||||
'Auml;' => "\xc3\x84",
|
||||
'Beta;' => "\xce\x92",
|
||||
'COPY' => "\xc2\xa9",
|
||||
'COPY;' => "\xc2\xa9",
|
||||
'Ccedil' => "\xc3\x87",
|
||||
'Ccedil;' => "\xc3\x87",
|
||||
'Chi;' => "\xce\xa7",
|
||||
'Dagger;' => "\xe2\x80\xa1",
|
||||
'Delta;' => "\xce\x94",
|
||||
'ETH' => "\xc3\x90",
|
||||
'ETH;' => "\xc3\x90",
|
||||
'Eacute' => "\xc3\x89",
|
||||
'Eacute;' => "\xc3\x89",
|
||||
'Ecirc' => "\xc3\x8a",
|
||||
'Ecirc;' => "\xc3\x8a",
|
||||
'Egrave' => "\xc3\x88",
|
||||
'Egrave;' => "\xc3\x88",
|
||||
'Epsilon;' => "\xce\x95",
|
||||
'Eta;' => "\xce\x97",
|
||||
'Euml' => "\xc3\x8b",
|
||||
'Euml;' => "\xc3\x8b",
|
||||
'GT' => '>',
|
||||
'GT;' => '>',
|
||||
'Gamma;' => "\xce\x93",
|
||||
'Iacute' => "\xc3\x8d",
|
||||
'Iacute;' => "\xc3\x8d",
|
||||
'Icirc' => "\xc3\x8e",
|
||||
'Icirc;' => "\xc3\x8e",
|
||||
'Igrave' => "\xc3\x8c",
|
||||
'Igrave;' => "\xc3\x8c",
|
||||
'Iota;' => "\xce\x99",
|
||||
'Iuml' => "\xc3\x8f",
|
||||
'Iuml;' => "\xc3\x8f",
|
||||
'Kappa;' => "\xce\x9a",
|
||||
'LT' => '<',
|
||||
'LT;' => '<',
|
||||
'Lambda;' => "\xce\x9b",
|
||||
'Mu;' => "\xce\x9c",
|
||||
'Ntilde' => "\xc3\x91",
|
||||
'Ntilde;' => "\xc3\x91",
|
||||
'Nu;' => "\xce\x9d",
|
||||
'OElig;' => "\xc5\x92",
|
||||
'Oacute' => "\xc3\x93",
|
||||
'Oacute;' => "\xc3\x93",
|
||||
'Ocirc' => "\xc3\x94",
|
||||
'Ocirc;' => "\xc3\x94",
|
||||
'Ograve' => "\xc3\x92",
|
||||
'Ograve;' => "\xc3\x92",
|
||||
'Omega;' => "\xce\xa9",
|
||||
'Omicron;' => "\xce\x9f",
|
||||
'Oslash' => "\xc3\x98",
|
||||
'Oslash;' => "\xc3\x98",
|
||||
'Otilde' => "\xc3\x95",
|
||||
'Otilde;' => "\xc3\x95",
|
||||
'Ouml' => "\xc3\x96",
|
||||
'Ouml;' => "\xc3\x96",
|
||||
'Phi;' => "\xce\xa6",
|
||||
'Pi;' => "\xce\xa0",
|
||||
'Prime;' => "\xe2\x80\xb3",
|
||||
'Psi;' => "\xce\xa8",
|
||||
'QUOT' => '"',
|
||||
'QUOT;' => '"',
|
||||
'REG' => "\xc2\xae",
|
||||
'REG;' => "\xc2\xae",
|
||||
'Rho;' => "\xce\xa1",
|
||||
'Scaron;' => "\xc5\xa0",
|
||||
'Sigma;' => "\xce\xa3",
|
||||
'THORN' => "\xc3\x9e",
|
||||
'THORN;' => "\xc3\x9e",
|
||||
'TRADE;' => "\xe2\x84\xa2",
|
||||
'Tau;' => "\xce\xa4",
|
||||
'Theta;' => "\xce\x98",
|
||||
'Uacute' => "\xc3\x9a",
|
||||
'Uacute;' => "\xc3\x9a",
|
||||
'Ucirc' => "\xc3\x9b",
|
||||
'Ucirc;' => "\xc3\x9b",
|
||||
'Ugrave' => "\xc3\x99",
|
||||
'Ugrave;' => "\xc3\x99",
|
||||
'Upsilon;' => "\xce\xa5",
|
||||
'Uuml' => "\xc3\x9c",
|
||||
'Uuml;' => "\xc3\x9c",
|
||||
'Xi;' => "\xce\x9e",
|
||||
'Yacute' => "\xc3\x9d",
|
||||
'Yacute;' => "\xc3\x9d",
|
||||
'Yuml;' => "\xc5\xb8",
|
||||
'Zeta;' => "\xce\x96",
|
||||
'aacute' => "\xc3\xa1",
|
||||
'aacute;' => "\xc3\xa1",
|
||||
'acirc' => "\xc3\xa2",
|
||||
'acirc;' => "\xc3\xa2",
|
||||
'acute' => "\xc2\xb4",
|
||||
'acute;' => "\xc2\xb4",
|
||||
'aelig' => "\xc3\xa6",
|
||||
'aelig;' => "\xc3\xa6",
|
||||
'agrave' => "\xc3\xa0",
|
||||
'agrave;' => "\xc3\xa0",
|
||||
'alefsym;' => "\xe2\x84\xb5",
|
||||
'alpha;' => "\xce\xb1",
|
||||
'amp' => '&',
|
||||
'amp;' => '&',
|
||||
'and;' => "\xe2\x88\xa7",
|
||||
'ang;' => "\xe2\x88\xa0",
|
||||
'apos;' => "'",
|
||||
'aring' => "\xc3\xa5",
|
||||
'aring;' => "\xc3\xa5",
|
||||
'asymp;' => "\xe2\x89\x88",
|
||||
'atilde' => "\xc3\xa3",
|
||||
'atilde;' => "\xc3\xa3",
|
||||
'auml' => "\xc3\xa4",
|
||||
'auml;' => "\xc3\xa4",
|
||||
'bdquo;' => "\xe2\x80\x9e",
|
||||
'beta;' => "\xce\xb2",
|
||||
'brvbar' => "\xc2\xa6",
|
||||
'brvbar;' => "\xc2\xa6",
|
||||
'bull;' => "\xe2\x80\xa2",
|
||||
'cap;' => "\xe2\x88\xa9",
|
||||
'ccedil' => "\xc3\xa7",
|
||||
'ccedil;' => "\xc3\xa7",
|
||||
'cedil' => "\xc2\xb8",
|
||||
'cedil;' => "\xc2\xb8",
|
||||
'cent' => "\xc2\xa2",
|
||||
'cent;' => "\xc2\xa2",
|
||||
'chi;' => "\xcf\x87",
|
||||
'circ;' => "\xcb\x86",
|
||||
'clubs;' => "\xe2\x99\xa3",
|
||||
'cong;' => "\xe2\x89\x85",
|
||||
'copy' => "\xc2\xa9",
|
||||
'copy;' => "\xc2\xa9",
|
||||
'crarr;' => "\xe2\x86\xb5",
|
||||
'cup;' => "\xe2\x88\xaa",
|
||||
'curren' => "\xc2\xa4",
|
||||
'curren;' => "\xc2\xa4",
|
||||
'dArr;' => "\xe2\x87\x93",
|
||||
'dagger;' => "\xe2\x80\xa0",
|
||||
'darr;' => "\xe2\x86\x93",
|
||||
'deg' => "\xc2\xb0",
|
||||
'deg;' => "\xc2\xb0",
|
||||
'delta;' => "\xce\xb4",
|
||||
'diams;' => "\xe2\x99\xa6",
|
||||
'divide' => "\xc3\xb7",
|
||||
'divide;' => "\xc3\xb7",
|
||||
'eacute' => "\xc3\xa9",
|
||||
'eacute;' => "\xc3\xa9",
|
||||
'ecirc' => "\xc3\xaa",
|
||||
'ecirc;' => "\xc3\xaa",
|
||||
'egrave' => "\xc3\xa8",
|
||||
'egrave;' => "\xc3\xa8",
|
||||
'empty;' => "\xe2\x88\x85",
|
||||
'emsp;' => "\xe2\x80\x83",
|
||||
'ensp;' => "\xe2\x80\x82",
|
||||
'epsilon;' => "\xce\xb5",
|
||||
'equiv;' => "\xe2\x89\xa1",
|
||||
'eta;' => "\xce\xb7",
|
||||
'eth' => "\xc3\xb0",
|
||||
'eth;' => "\xc3\xb0",
|
||||
'euml' => "\xc3\xab",
|
||||
'euml;' => "\xc3\xab",
|
||||
'euro;' => "\xe2\x82\xac",
|
||||
'exist;' => "\xe2\x88\x83",
|
||||
'fnof;' => "\xc6\x92",
|
||||
'forall;' => "\xe2\x88\x80",
|
||||
'frac12' => "\xc2\xbd",
|
||||
'frac12;' => "\xc2\xbd",
|
||||
'frac14' => "\xc2\xbc",
|
||||
'frac14;' => "\xc2\xbc",
|
||||
'frac34' => "\xc2\xbe",
|
||||
'frac34;' => "\xc2\xbe",
|
||||
'frasl;' => "\xe2\x81\x84",
|
||||
'gamma;' => "\xce\xb3",
|
||||
'ge;' => "\xe2\x89\xa5",
|
||||
'gt' => '>',
|
||||
'gt;' => '>',
|
||||
'hArr;' => "\xe2\x87\x94",
|
||||
'harr;' => "\xe2\x86\x94",
|
||||
'hearts;' => "\xe2\x99\xa5",
|
||||
'hellip;' => "\xe2\x80\xa6",
|
||||
'iacute' => "\xc3\xad",
|
||||
'iacute;' => "\xc3\xad",
|
||||
'icirc' => "\xc3\xae",
|
||||
'icirc;' => "\xc3\xae",
|
||||
'iexcl' => "\xc2\xa1",
|
||||
'iexcl;' => "\xc2\xa1",
|
||||
'igrave' => "\xc3\xac",
|
||||
'igrave;' => "\xc3\xac",
|
||||
'image;' => "\xe2\x84\x91",
|
||||
'infin;' => "\xe2\x88\x9e",
|
||||
'int;' => "\xe2\x88\xab",
|
||||
'iota;' => "\xce\xb9",
|
||||
'iquest' => "\xc2\xbf",
|
||||
'iquest;' => "\xc2\xbf",
|
||||
'isin;' => "\xe2\x88\x88",
|
||||
'iuml' => "\xc3\xaf",
|
||||
'iuml;' => "\xc3\xaf",
|
||||
'kappa;' => "\xce\xba",
|
||||
'lArr;' => "\xe2\x87\x90",
|
||||
'lambda;' => "\xce\xbb",
|
||||
'lang;' => "\xe3\x80\x88",
|
||||
'laquo' => "\xc2\xab",
|
||||
'laquo;' => "\xc2\xab",
|
||||
'larr;' => "\xe2\x86\x90",
|
||||
'lceil;' => "\xe2\x8c\x88",
|
||||
'ldquo;' => "\xe2\x80\x9c",
|
||||
'le;' => "\xe2\x89\xa4",
|
||||
'lfloor;' => "\xe2\x8c\x8a",
|
||||
'lowast;' => "\xe2\x88\x97",
|
||||
'loz;' => "\xe2\x97\x8a",
|
||||
'lrm;' => "\xe2\x80\x8e",
|
||||
'lsaquo;' => "\xe2\x80\xb9",
|
||||
'lsquo;' => "\xe2\x80\x98",
|
||||
'lt' => '<',
|
||||
'lt;' => '<',
|
||||
'macr' => "\xc2\xaf",
|
||||
'macr;' => "\xc2\xaf",
|
||||
'mdash;' => "\xe2\x80\x94",
|
||||
'micro' => "\xc2\xb5",
|
||||
'micro;' => "\xc2\xb5",
|
||||
'middot' => "\xc2\xb7",
|
||||
'middot;' => "\xc2\xb7",
|
||||
'minus;' => "\xe2\x88\x92",
|
||||
'mu;' => "\xce\xbc",
|
||||
'nabla;' => "\xe2\x88\x87",
|
||||
'nbsp' => "\xc2\xa0",
|
||||
'nbsp;' => "\xc2\xa0",
|
||||
'ndash;' => "\xe2\x80\x93",
|
||||
'ne;' => "\xe2\x89\xa0",
|
||||
'ni;' => "\xe2\x88\x8b",
|
||||
'not' => "\xc2\xac",
|
||||
'not;' => "\xc2\xac",
|
||||
'notin;' => "\xe2\x88\x89",
|
||||
'nsub;' => "\xe2\x8a\x84",
|
||||
'ntilde' => "\xc3\xb1",
|
||||
'ntilde;' => "\xc3\xb1",
|
||||
'nu;' => "\xce\xbd",
|
||||
'oacute' => "\xc3\xb3",
|
||||
'oacute;' => "\xc3\xb3",
|
||||
'ocirc' => "\xc3\xb4",
|
||||
'ocirc;' => "\xc3\xb4",
|
||||
'oelig;' => "\xc5\x93",
|
||||
'ograve' => "\xc3\xb2",
|
||||
'ograve;' => "\xc3\xb2",
|
||||
'oline;' => "\xe2\x80\xbe",
|
||||
'omega;' => "\xcf\x89",
|
||||
'omicron;' => "\xce\xbf",
|
||||
'oplus;' => "\xe2\x8a\x95",
|
||||
'or;' => "\xe2\x88\xa8",
|
||||
'ordf' => "\xc2\xaa",
|
||||
'ordf;' => "\xc2\xaa",
|
||||
'ordm' => "\xc2\xba",
|
||||
'ordm;' => "\xc2\xba",
|
||||
'oslash' => "\xc3\xb8",
|
||||
'oslash;' => "\xc3\xb8",
|
||||
'otilde' => "\xc3\xb5",
|
||||
'otilde;' => "\xc3\xb5",
|
||||
'otimes;' => "\xe2\x8a\x97",
|
||||
'ouml' => "\xc3\xb6",
|
||||
'ouml;' => "\xc3\xb6",
|
||||
'para' => "\xc2\xb6",
|
||||
'para;' => "\xc2\xb6",
|
||||
'part;' => "\xe2\x88\x82",
|
||||
'permil;' => "\xe2\x80\xb0",
|
||||
'perp;' => "\xe2\x8a\xa5",
|
||||
'phi;' => "\xcf\x86",
|
||||
'pi;' => "\xcf\x80",
|
||||
'piv;' => "\xcf\x96",
|
||||
'plusmn' => "\xc2\xb1",
|
||||
'plusmn;' => "\xc2\xb1",
|
||||
'pound' => "\xc2\xa3",
|
||||
'pound;' => "\xc2\xa3",
|
||||
'prime;' => "\xe2\x80\xb2",
|
||||
'prod;' => "\xe2\x88\x8f",
|
||||
'prop;' => "\xe2\x88\x9d",
|
||||
'psi;' => "\xcf\x88",
|
||||
'quot' => '"',
|
||||
'quot;' => '"',
|
||||
'rArr;' => "\xe2\x87\x92",
|
||||
'radic;' => "\xe2\x88\x9a",
|
||||
'rang;' => "\xe3\x80\x89",
|
||||
'raquo' => "\xc2\xbb",
|
||||
'raquo;' => "\xc2\xbb",
|
||||
'rarr;' => "\xe2\x86\x92",
|
||||
'rceil;' => "\xe2\x8c\x89",
|
||||
'rdquo;' => "\xe2\x80\x9d",
|
||||
'real;' => "\xe2\x84\x9c",
|
||||
'reg' => "\xc2\xae",
|
||||
'reg;' => "\xc2\xae",
|
||||
'rfloor;' => "\xe2\x8c\x8b",
|
||||
'rho;' => "\xcf\x81",
|
||||
'rlm;' => "\xe2\x80\x8f",
|
||||
'rsaquo;' => "\xe2\x80\xba",
|
||||
'rsquo;' => "\xe2\x80\x99",
|
||||
'sbquo;' => "\xe2\x80\x9a",
|
||||
'scaron;' => "\xc5\xa1",
|
||||
'sdot;' => "\xe2\x8b\x85",
|
||||
'sect' => "\xc2\xa7",
|
||||
'sect;' => "\xc2\xa7",
|
||||
'shy' => "\xc2\xad",
|
||||
'shy;' => "\xc2\xad",
|
||||
'sigma;' => "\xcf\x83",
|
||||
'sigmaf;' => "\xcf\x82",
|
||||
'sim;' => "\xe2\x88\xbc",
|
||||
'spades;' => "\xe2\x99\xa0",
|
||||
'sub;' => "\xe2\x8a\x82",
|
||||
'sube;' => "\xe2\x8a\x86",
|
||||
'sum;' => "\xe2\x88\x91",
|
||||
'sup1' => "\xc2\xb9",
|
||||
'sup1;' => "\xc2\xb9",
|
||||
'sup2' => "\xc2\xb2",
|
||||
'sup2;' => "\xc2\xb2",
|
||||
'sup3' => "\xc2\xb3",
|
||||
'sup3;' => "\xc2\xb3",
|
||||
'sup;' => "\xe2\x8a\x83",
|
||||
'supe;' => "\xe2\x8a\x87",
|
||||
'szlig' => "\xc3\x9f",
|
||||
'szlig;' => "\xc3\x9f",
|
||||
'tau;' => "\xcf\x84",
|
||||
'there4;' => "\xe2\x88\xb4",
|
||||
'theta;' => "\xce\xb8",
|
||||
'thetasym;' => "\xcf\x91",
|
||||
'thinsp;' => "\xe2\x80\x89",
|
||||
'thorn' => "\xc3\xbe",
|
||||
'thorn;' => "\xc3\xbe",
|
||||
'tilde;' => "\xcb\x9c",
|
||||
'times' => "\xc3\x97",
|
||||
'times;' => "\xc3\x97",
|
||||
'trade;' => "\xe2\x84\xa2",
|
||||
'uArr;' => "\xe2\x87\x91",
|
||||
'uacute' => "\xc3\xba",
|
||||
'uacute;' => "\xc3\xba",
|
||||
'uarr;' => "\xe2\x86\x91",
|
||||
'ucirc' => "\xc3\xbb",
|
||||
'ucirc;' => "\xc3\xbb",
|
||||
'ugrave' => "\xc3\xb9",
|
||||
'ugrave;' => "\xc3\xb9",
|
||||
'uml' => "\xc2\xa8",
|
||||
'uml;' => "\xc2\xa8",
|
||||
'upsih;' => "\xcf\x92",
|
||||
'upsilon;' => "\xcf\x85",
|
||||
'uuml' => "\xc3\xbc",
|
||||
'uuml;' => "\xc3\xbc",
|
||||
'weierp;' => "\xe2\x84\x98",
|
||||
'xi;' => "\xce\xbe",
|
||||
'yacute' => "\xc3\xbd",
|
||||
'yacute;' => "\xc3\xbd",
|
||||
'yen' => "\xc2\xa5",
|
||||
'yen;' => "\xc2\xa5",
|
||||
'yuml' => "\xc3\xbf",
|
||||
'yuml;' => "\xc3\xbf",
|
||||
'zeta;' => "\xce\xb6",
|
||||
'zwj;' => "\xe2\x80\x8d",
|
||||
'zwnj;' => "\xe2\x80\x8c"
|
||||
}
|
||||
|
||||
ENCODINGS = %w[
|
||||
ansi_x3.4-1968
|
||||
iso-ir-6
|
||||
ansi_x3.4-1986
|
||||
iso_646.irv:1991
|
||||
ascii
|
||||
iso646-us
|
||||
us-ascii
|
||||
us
|
||||
ibm367
|
||||
cp367
|
||||
csascii
|
||||
ks_c_5601-1987
|
||||
korean
|
||||
iso-2022-kr
|
||||
csiso2022kr
|
||||
euc-kr
|
||||
iso-2022-jp
|
||||
csiso2022jp
|
||||
iso-2022-jp-2
|
||||
iso-ir-58
|
||||
chinese
|
||||
csiso58gb231280
|
||||
iso_8859-1:1987
|
||||
iso-ir-100
|
||||
iso_8859-1
|
||||
iso-8859-1
|
||||
latin1
|
||||
l1
|
||||
ibm819
|
||||
cp819
|
||||
csisolatin1
|
||||
iso_8859-2:1987
|
||||
iso-ir-101
|
||||
iso_8859-2
|
||||
iso-8859-2
|
||||
latin2
|
||||
l2
|
||||
csisolatin2
|
||||
iso_8859-3:1988
|
||||
iso-ir-109
|
||||
iso_8859-3
|
||||
iso-8859-3
|
||||
latin3
|
||||
l3
|
||||
csisolatin3
|
||||
iso_8859-4:1988
|
||||
iso-ir-110
|
||||
iso_8859-4
|
||||
iso-8859-4
|
||||
latin4
|
||||
l4
|
||||
csisolatin4
|
||||
iso_8859-6:1987
|
||||
iso-ir-127
|
||||
iso_8859-6
|
||||
iso-8859-6
|
||||
ecma-114
|
||||
asmo-708
|
||||
arabic
|
||||
csisolatinarabic
|
||||
iso_8859-7:1987
|
||||
iso-ir-126
|
||||
iso_8859-7
|
||||
iso-8859-7
|
||||
elot_928
|
||||
ecma-118
|
||||
greek
|
||||
greek8
|
||||
csisolatingreek
|
||||
iso_8859-8:1988
|
||||
iso-ir-138
|
||||
iso_8859-8
|
||||
iso-8859-8
|
||||
hebrew
|
||||
csisolatinhebrew
|
||||
iso_8859-5:1988
|
||||
iso-ir-144
|
||||
iso_8859-5
|
||||
iso-8859-5
|
||||
cyrillic
|
||||
csisolatincyrillic
|
||||
iso_8859-9:1989
|
||||
iso-ir-148
|
||||
iso_8859-9
|
||||
iso-8859-9
|
||||
latin5
|
||||
l5
|
||||
csisolatin5
|
||||
iso-8859-10
|
||||
iso-ir-157
|
||||
l6
|
||||
iso_8859-10:1992
|
||||
csisolatin6
|
||||
latin6
|
||||
hp-roman8
|
||||
roman8
|
||||
r8
|
||||
ibm037
|
||||
cp037
|
||||
csibm037
|
||||
ibm424
|
||||
cp424
|
||||
csibm424
|
||||
ibm437
|
||||
cp437
|
||||
437
|
||||
cspc8codepage437
|
||||
ibm500
|
||||
cp500
|
||||
csibm500
|
||||
ibm775
|
||||
cp775
|
||||
cspc775baltic
|
||||
ibm850
|
||||
cp850
|
||||
850
|
||||
cspc850multilingual
|
||||
ibm852
|
||||
cp852
|
||||
852
|
||||
cspcp852
|
||||
ibm855
|
||||
cp855
|
||||
855
|
||||
csibm855
|
||||
ibm857
|
||||
cp857
|
||||
857
|
||||
csibm857
|
||||
ibm860
|
||||
cp860
|
||||
860
|
||||
csibm860
|
||||
ibm861
|
||||
cp861
|
||||
861
|
||||
cp-is
|
||||
csibm861
|
||||
ibm862
|
||||
cp862
|
||||
862
|
||||
cspc862latinhebrew
|
||||
ibm863
|
||||
cp863
|
||||
863
|
||||
csibm863
|
||||
ibm864
|
||||
cp864
|
||||
csibm864
|
||||
ibm865
|
||||
cp865
|
||||
865
|
||||
csibm865
|
||||
ibm866
|
||||
cp866
|
||||
866
|
||||
csibm866
|
||||
ibm869
|
||||
cp869
|
||||
869
|
||||
cp-gr
|
||||
csibm869
|
||||
ibm1026
|
||||
cp1026
|
||||
csibm1026
|
||||
koi8-r
|
||||
cskoi8r
|
||||
koi8-u
|
||||
big5-hkscs
|
||||
ptcp154
|
||||
csptcp154
|
||||
pt154
|
||||
cp154
|
||||
utf-7
|
||||
utf-16be
|
||||
utf-16le
|
||||
utf-16
|
||||
utf-8
|
||||
iso-8859-13
|
||||
iso-8859-14
|
||||
iso-ir-199
|
||||
iso_8859-14:1998
|
||||
iso_8859-14
|
||||
latin8
|
||||
iso-celtic
|
||||
l8
|
||||
iso-8859-15
|
||||
iso_8859-15
|
||||
iso-8859-16
|
||||
iso-ir-226
|
||||
iso_8859-16:2001
|
||||
iso_8859-16
|
||||
latin10
|
||||
l10
|
||||
gbk
|
||||
cp936
|
||||
ms936
|
||||
gb18030
|
||||
shift_jis
|
||||
ms_kanji
|
||||
csshiftjis
|
||||
euc-jp
|
||||
gb2312
|
||||
big5
|
||||
csbig5
|
||||
windows-1250
|
||||
windows-1251
|
||||
windows-1252
|
||||
windows-1253
|
||||
windows-1254
|
||||
windows-1255
|
||||
windows-1256
|
||||
windows-1257
|
||||
windows-1258
|
||||
tis-620
|
||||
hz-gb-2312
|
||||
]
|
||||
|
||||
end
|
1
vendor/plugins/HTML5lib/lib/html5/filters.rb
vendored
Normal file
1
vendor/plugins/HTML5lib/lib/html5/filters.rb
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
require 'html5/filters/optionaltags'
|
|
@ -1,7 +1,7 @@
|
|||
require 'delegate'
|
||||
require 'enumerator'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module Filters
|
||||
class Base < SimpleDelegator
|
||||
include Enumerable
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/filters/base'
|
||||
require 'html5/filters/base'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module Filters
|
||||
class InjectMetaCharset < Base
|
||||
def initialize(source, encoding)
|
|
@ -1,7 +1,7 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5lib/filters/base'
|
||||
require 'html5/constants'
|
||||
require 'html5/filters/base'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module Filters
|
||||
|
||||
class OptionalTagFilter < Base
|
|
@ -1,7 +1,7 @@
|
|||
require 'html5lib/filters/base'
|
||||
require 'html5lib/sanitizer'
|
||||
require 'html5/filters/base'
|
||||
require 'html5/sanitizer'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module Filters
|
||||
class HTMLSanitizeFilter < Base
|
||||
include HTMLSanitizeModule
|
|
@ -1,7 +1,7 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5lib/filters/base'
|
||||
require 'html5/constants'
|
||||
require 'html5/filters/base'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module Filters
|
||||
class WhitespaceFilter < Base
|
||||
|
|
@ -1,246 +1,246 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5lib/tokenizer'
|
||||
require 'html5lib/treebuilders/rexml'
|
||||
|
||||
Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
|
||||
require 'html5lib/html5parser/' + File.basename(path)
|
||||
end
|
||||
|
||||
module HTML5lib

  # Error in parsed document
  class ParseError < Exception; end
  class AssertionError < Exception; end

  # HTML parser. Generates a tree structure from a stream of (possibly
  # malformed) HTML.
  class HTMLParser

    attr_accessor :phase, :firstStartTag, :innerHTML, :lastPhase, :insertFromTable

    attr_reader :phases, :tokenizer, :tree, :errors

    # Builds a parser from +options+ and parses +stream+ as a document.
    #
    # The :encoding key is taken out of the options and handed to #parse;
    # every remaining key goes to the constructor. The hash is copied first
    # so the caller's hash is left untouched.
    def self.parse(stream, options = {})
      options = options.dup
      encoding = options.delete(:encoding)
      new(options).parse(stream, encoding)
    end

    # Builds a parser from +options+ and parses +stream+ as a fragment.
    #
    # The :container (default 'div') and :encoding keys are taken out of the
    # options and handed to #parseFragment; every remaining key goes to the
    # constructor. The hash is copied first so the caller's hash is left
    # untouched.
    def self.parseFragment(stream, options = {})
      options = options.dup
      container = options.delete(:container) || 'div'
      encoding = options.delete(:encoding)
      new(options).parseFragment(stream, container, encoding)
    end

    # Phase names; each one is mapped to a "<Name>Phase" class in #initialize.
    @@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption
      inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd )

    # :strict - raise an exception when a parse error is encountered
    # :tree - a treebuilder class controlling the type of tree that will be
    # returned. Built in treebuilders can be accessed through
    # HTML5lib::TreeBuilders[treeType]
    #
    # Every key in +options+ is copied verbatim into the instance variable of
    # the same name, overriding the defaults set below.
    def initialize(options = {})
      @strict = false
      @errors = []

      @tokenizer = HTMLTokenizer
      @tree = TreeBuilders::REXML::TreeBuilder

      options.each { |name, value| instance_variable_set("@#{name}", value) }

      # @tree holds a treebuilder class up to here; replace it with an instance.
      @tree = @tree.new

      # One phase object per name, keyed by symbol ('inBody' -> InBodyPhase).
      @phases = @@phases.inject({}) do |phases, phase_name|
        phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
        phases[phase_name.to_sym] = HTML5lib.const_get(phase_class_name).new(self, @tree)
        phases
      end
    end

    # Shared driver behind #parse and #parseFragment.
    #
    # stream    - input handed to the tokenizer
    # innerHTML - true when parsing a fragment, false for a full document
    # encoding  - passed to the tokenizer as :encoding
    # container - name of the context element; only used when innerHTML is true
    def _parse(stream, innerHTML, encoding, container = 'div')
      @tree.reset
      @firstStartTag = false
      @errors = []

      # After a previous run @tokenizer is an instance; go back to its class
      # so the parser object can be reused.
      @tokenizer = @tokenizer.class unless Class === @tokenizer
      @tokenizer = @tokenizer.new(stream, :encoding => encoding,
                                          :parseMeta => !innerHTML)

      if innerHTML
        case @innerHTML = container.downcase
        when 'title', 'textarea'
          @tokenizer.contentModelFlag = :RCDATA
        when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
          @tokenizer.contentModelFlag = :CDATA
        when 'plaintext'
          @tokenizer.contentModelFlag = :PLAINTEXT
        else
          # contentModelFlag already is PCDATA
          #@tokenizer.contentModelFlag = :PCDATA
        end

        @phase = @phases[:rootElement]
        @phase.insertHtmlElement
        resetInsertionMode
      else
        @innerHTML = false
        @phase = @phases[:initial]
      end

      # We only seem to have InBodyPhase testcases where the following is
      # relevant ... need others too
      @lastPhase = nil

      # XXX This is temporary for the moment so there isn't any other
      # changes needed for the parser to work with the iterable tokenizer
      @tokenizer.each do |token|
        token = normalizeToken(token)

        # Dispatch to the current phase's process<TokenType> method.
        method = 'process%s' % token[:type]

        case token[:type]
        when :Characters, :SpaceCharacters, :Comment
          @phase.send method, token[:data]
        when :StartTag
          @phase.send method, token[:name], token[:data]
        when :EndTag
          @phase.send method, token[:name]
        when :Doctype
          @phase.send method, token[:name], token[:publicId],
            token[:systemId], token[:correct]
        else
          parseError(token[:data])
        end
      end

      # When the loop finishes it's EOF
      @phase.processEOF
    end

    # Parse an HTML document into a well-formed tree
    #
    # stream - a filelike object or string containing the HTML to be parsed
    #
    # The optional encoding parameter must be a string that indicates
    # the encoding. If specified, that encoding will be used,
    # regardless of any BOM or later declaration (such as in a meta
    # element)
    def parse(stream, encoding=nil)
      _parse(stream, false, encoding)
      return @tree.getDocument
    end

    # Parse an HTML fragment into a well-formed tree fragment
    #
    # container - name of the element we're setting the innerHTML property
    # if set to nil, default to 'div'
    #
    # stream - a filelike object or string containing the HTML to be parsed
    #
    # The optional encoding parameter must be a string that indicates
    # the encoding. If specified, that encoding will be used,
    # regardless of any BOM or later declaration (such as in a meta
    # element)
    def parseFragment(stream, container='div', encoding=nil)
      # Honour the documented "nil means 'div'" contract; _parse calls
      # container.downcase and would otherwise fail on nil.
      _parse(stream, true, encoding, container || 'div')
      return @tree.getFragment
    end

    # Records a parse error as [stream position, message] in @errors and, in
    # strict mode, raises ParseError carrying the same message.
    def parseError(data = 'XXX ERROR MESSAGE NEEDED')
      # XXX The idea is to make data mandatory.
      @errors.push([@tokenizer.stream.position, data])
      raise ParseError, data if @strict
    end

    # HTML5 specific normalizations to the token stream
    def normalizeToken(token)

      if token[:type] == :EmptyTag
        # When a solidus (/) is encountered within a tag name what happens
        # depends on whether the current tag name matches that of a void
        # element. If it matches a void element atheists did the wrong
        # thing and if it doesn't it's wrong for everyone.

        unless VOID_ELEMENTS.include?(token[:name])
          parseError(_('Solidus (/) incorrectly placed in tag.'))
        end

        token[:type] = :StartTag
      end

      if token[:type] == :StartTag
        token[:name] = token[:name].tr(ASCII_UPPERCASE,ASCII_LOWERCASE)

        # We need to remove the duplicate attributes and convert attributes
        # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
        # (the list is reversed first so that the earliest occurrence of a
        # name is the one that ends up in the hash).

        unless token[:data].empty?
          data = token[:data].reverse.map { |attr, value| [attr.tr(ASCII_UPPERCASE, ASCII_LOWERCASE), value] }
          token[:data] = Hash[*data.flatten]
        end

      elsif token[:type] == :EndTag
        parseError(_('End tag contains unexpected attributes.')) unless token[:data].empty?
        token[:name] = token[:name].downcase
      end

      return token
    end

    # Element name -> phase key used by resetInsertionMode.
    @@new_modes = {
      'select' => :inSelect,
      'td' => :inCell,
      'th' => :inCell,
      'tr' => :inRow,
      'tbody' => :inTableBody,
      'thead' => :inTableBody,
      'tfoot' => :inTableBody,
      'caption' => :inCaption,
      'colgroup' => :inColumnGroup,
      'table' => :inTable,
      'head' => :inBody,
      'body' => :inBody,
      'frameset' => :inFrameset
    }

    # Walks the open-element stack from the innermost element outwards and
    # sets @phase from the first element that determines it.
    def resetInsertionMode
      # The name of this method is mostly historical. (It's also used in the
      # specification.)
      last = false

      @tree.openElements.reverse.each do |node|
        nodeName = node.name

        if node == @tree.openElements[0]
          last = true
          unless ['td', 'th'].include?(nodeName)
            # XXX
            # assert @innerHTML
            nodeName = @innerHTML
          end
        end

        # XXX 'select', 'colgroup', 'head' and 'frameset' should only be
        # seen here in the innerHTML case:
        # assert @innerHTML

        if @@new_modes.has_key?(nodeName)
          @phase = @phases[@@new_modes[nodeName]]
        elsif nodeName == 'html'
          @phase = @phases[@tree.headPointer.nil? ? :beforeHead : :afterHead]
        elsif last
          @phase = @phases[:inBody]
        else
          next
        end

        break
      end
    end

    # Identity hook wrapped around user-visible messages.
    def _(string); string; end
  end

end
|
||||
require 'html5/constants'
|
||||
require 'html5/tokenizer'
|
||||
require 'html5/treebuilders/rexml'
|
||||
|
||||
Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path|
|
||||
require 'html5/html5parser/' + File.basename(path)
|
||||
end
|
||||
|
||||
module HTML5
|
||||
|
||||
# Error in parsed document
|
||||
class ParseError < Exception; end
|
||||
class AssertionError < Exception; end
|
||||
|
||||
# HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
|
||||
#
|
||||
class HTMLParser
|
||||
|
||||
attr_accessor :phase, :firstStartTag, :innerHTML, :lastPhase, :insertFromTable
|
||||
|
||||
attr_reader :phases, :tokenizer, :tree, :errors
|
||||
|
||||
# Builds a parser from +options+ and parses +stream+ as a document.
#
# The :encoding key is taken out of the options and handed to the instance
# method #parse; every remaining key goes to the constructor. The hash is
# copied first so the caller's hash is not modified by the delete.
def self.parse(stream, options = {})
  options = options.dup
  encoding = options.delete(:encoding)
  new(options).parse(stream, encoding)
end
|
||||
|
||||
# Builds a parser from +options+ and parses +stream+ as a fragment.
#
# The :container (default 'div') and :encoding keys are taken out of the
# options and handed to the instance method #parseFragment; every remaining
# key goes to the constructor. The hash is copied first so the caller's hash
# is not modified by the deletes.
def self.parseFragment(stream, options = {})
  options = options.dup
  container = options.delete(:container) || 'div'
  encoding = options.delete(:encoding)
  new(options).parseFragment(stream, container, encoding)
end
|
||||
|
||||
# Names of the parser phases. initialize turns each name into a phase object
# by capitalising the first letter and appending 'Phase' to get the class name.
@@phases = %w[
  initial
  rootElement
  beforeHead
  inHead
  afterHead
  inBody
  inTable
  inCaption
  inColumnGroup
  inTableBody
  inRow
  inCell
  inSelect
  afterBody
  inFrameset
  afterFrameset
  trailingEnd
]
|
||||
|
||||
# :strict - raise an exception when a parse error is encountered
|
||||
# :tree - a treebuilder class controlling the type of tree that will be
|
||||
# returned. Built in treebuilders can be accessed through
|
||||
# HTML5::TreeBuilders[treeType]
|
||||
# Sets up a parser. Every key in +options+ is copied into the instance
# variable of the same name, overriding the defaults assigned first.
def initialize(options = {})
  # Defaults, replaceable through +options+.
  @strict    = false
  @errors    = []
  @tokenizer = HTMLTokenizer
  @tree      = TreeBuilders::REXML::TreeBuilder

  options.each do |name, value|
    instance_variable_set("@#{name}", value)
  end

  # Up to here @tree holds a treebuilder class; replace it with an instance.
  @tree = @tree.new

  # Build one phase object per entry of @@phases, keyed by the name as a
  # symbol. The table is only published once every phase has been created.
  table = {}
  @@phases.each do |phase_name|
    class_name = phase_name[0, 1].upcase + phase_name[1..-1] + 'Phase'
    table[phase_name.to_sym] = HTML5.const_get(class_name).new(self, @tree)
  end
  @phases = table
end
|
||||
|
||||
# Shared driver behind parse and parseFragment.
#
# stream    - handed to the tokenizer constructor
# innerHTML - true when parsing a fragment, false for a whole document
# encoding  - forwarded to the tokenizer as its :encoding option
# container - fragment mode only: lower-cased, stored in @innerHTML and used
#             to choose the tokenizer's content model
#
# Returns the result of the final phase's processEOF.
def _parse(stream, innerHTML, encoding, container = 'div')
  @tree.reset
  @firstStartTag = false
  @errors = []

  # @tokenizer may be a class or an already-built instance; in the latter
  # case fall back to its class so a fresh tokenizer can be created.
  @tokenizer = Class === @tokenizer ? @tokenizer : @tokenizer.class
  @tokenizer = @tokenizer.new(stream, :encoding => encoding,
                                      :parseMeta => !innerHTML)

  if innerHTML
    @innerHTML = container.downcase

    # Any container not listed below leaves the content model flag alone
    # (the original notes it is already PCDATA at this point).
    if %w[title textarea].include?(@innerHTML)
      @tokenizer.contentModelFlag = :RCDATA
    elsif %w[style script xmp iframe noembed noframes noscript].include?(@innerHTML)
      @tokenizer.contentModelFlag = :CDATA
    elsif @innerHTML == 'plaintext'
      @tokenizer.contentModelFlag = :PLAINTEXT
    end

    @phase = @phases[:rootElement]
    @phase.insertHtmlElement
    resetInsertionMode
  else
    @innerHTML = false
    @phase = @phases[:initial]
  end

  # We only seem to have InBodyPhase testcases where the following is
  # relevant ... need others too
  @lastPhase = nil

  # XXX Temporary: lets the parser work with the iterable tokenizer without
  # further changes.
  @tokenizer.each do |raw_token|
    token = normalizeToken(raw_token)
    type = token[:type]
    handler = "process#{type}"

    # @phase is looked up again for every token because handlers may switch it.
    if [:Characters, :SpaceCharacters, :Comment].include?(type)
      @phase.send(handler, token[:data])
    elsif type == :StartTag
      @phase.send(handler, token[:name], token[:data])
    elsif type == :EndTag
      @phase.send(handler, token[:name])
    elsif type == :Doctype
      @phase.send(handler, token[:name], token[:publicId],
                  token[:systemId], token[:correct])
    else
      # Anything else is recorded as a parse error using the token's data.
      parseError(token[:data])
    end
  end

  # The tokenizer is exhausted: signal EOF to the current phase.
  @phase.processEOF
end
|
||||
|
||||
# Parse a HTML document into a well-formed tree
|
||||
#
|
||||
# stream - a filelike object or string containing the HTML to be parsed
|
||||
#
|
||||
# The optional encoding parameter must be a string that indicates
|
||||
# the encoding. If specified, that encoding will be used,
|
||||
# regardless of any BOM or later declaration (such as in a meta
|
||||
# element)
|
||||
# Runs a full-document parse of +stream+ (optionally forcing +encoding+) and
# returns the treebuilder's document.
def parse(stream, encoding=nil)
  _parse(stream, false, encoding)
  @tree.getDocument
end
|
||||
|
||||
# Parse a HTML fragment into a well-formed tree fragment
|
||||
|
||||
# container - name of the element we're setting the innerHTML property
|
||||
# if set to nil, default to 'div'
|
||||
#
|
||||
# stream - a filelike object or string containing the HTML to be parsed
|
||||
#
|
||||
# The optional encoding parameter must be a string that indicates
|
||||
# the encoding. If specified, that encoding will be used,
|
||||
# regardless of any BOM or later declaration (such as in a meta
|
||||
# element)
|
||||
# Runs a fragment (innerHTML-style) parse of +stream+ inside +container+
# (optionally forcing +encoding+) and returns the treebuilder's fragment.
def parseFragment(stream, container='div', encoding=nil)
  _parse(stream, true, encoding, container)
  @tree.getFragment
end
|
||||
|
||||
# Records a parse error as a [stream position, message] pair in @errors and,
# when @strict is set, raises ParseError afterwards.
#
# data - the error message (XXX the idea is to make it mandatory)
def parseError(data = 'XXX ERROR MESSAGE NEEDED')
  position = @tokenizer.stream.position
  @errors << [position, data]
  raise ParseError if @strict
end
|
||||
|
||||
# HTML5 specific normalizations to the token stream
|
||||
# HTML5-specific normalisation of one tokenizer token. Mutates +token+ and
# returns it.
#
# * :EmptyTag - a trailing solidus is only tolerated on names listed in
#               VOID_ELEMENTS; anything else is reported as a parse error.
#               Either way the token is then handled as a :StartTag.
# * :StartTag - the tag name and attribute names are ASCII-lowercased; a
#               non-empty attribute pair list becomes a Hash in which the
#               first occurrence of a duplicated attribute wins
#               ([["x", "y"], ["x", "z"]] becomes {"x" => "y"}).
# * :EndTag   - attributes are reported as a parse error and the tag name is
#               ASCII-lowercased.
#
# Tokens of any other type are returned untouched.
def normalizeToken(token)
  if token[:type] == :EmptyTag
    unless VOID_ELEMENTS.include?(token[:name])
      parseError(_('Solidus (/) incorrectly placed in tag.'))
    end
    token[:type] = :StartTag
  end

  case token[:type]
  when :StartTag
    token[:name] = token[:name].tr(ASCII_UPPERCASE, ASCII_LOWERCASE)

    unless token[:data].empty?
      # Walk the pairs back to front so that an earlier duplicate overwrites
      # a later one, i.e. the first occurrence wins.
      attributes = {}
      token[:data].reverse.each do |attr, value|
        attributes[attr.tr(ASCII_UPPERCASE, ASCII_LOWERCASE)] = value
      end
      token[:data] = attributes
    end

  when :EndTag
    parseError(_('End tag contains unexpected attributes.')) unless token[:data].empty?
    # Same ASCII-only folding as for start tags. String#downcase is
    # Unicode-aware on current Rubies and would make <É> and </É> normalise
    # to different names.
    token[:name] = token[:name].tr(ASCII_UPPERCASE, ASCII_LOWERCASE)
  end

  return token
end
|
||||
|
||||
# Element name => phase that resetInsertionMode selects when it meets an
# open element with that name.
@@new_modes = {
  'select'   => :inSelect,
  'td'       => :inCell,
  'th'       => :inCell,
  'tr'       => :inRow,
  'tbody'    => :inTableBody,
  'thead'    => :inTableBody,
  'tfoot'    => :inTableBody,
  'caption'  => :inCaption,
  'colgroup' => :inColumnGroup,
  'table'    => :inTable,
  'head'     => :inBody,
  'body'     => :inBody,
  'frameset' => :inFrameset
}
|
||||
|
||||
# Picks @phase from the stack of open elements, scanning from the most
# recently opened element towards the root and stopping at the first element
# that determines a phase. (The method name is mostly historical; it is also
# the name used in the specification.)
def resetInsertionMode
  last = false

  @tree.openElements.reverse.each do |element|
    name = element.name

    if element == @tree.openElements[0]
      last = true
      # XXX assert @innerHTML
      # For the bottom-most element (unless it is a table cell) the fragment
      # container name stored in @innerHTML is used instead of its own name.
      name = @innerHTML unless %w[td th].include?(name)
    end

    # XXX the names select, colgroup, head and frameset should only turn up
    # here in the innerHTML case (assert @innerHTML).

    target =
      if @@new_modes.has_key?(name)
        @@new_modes[name]
      elsif name == 'html'
        @tree.headPointer.nil? ? :beforeHead : :afterHead
      elsif last
        :inBody
      end

    # Nothing decided by this element: move on to the next one down the stack.
    next unless target

    @phase = @phases[target]
    break
  end
end
|
||||
|
||||
# Returns +string+ unchanged.
# NOTE(review): looks like a gettext-style marker for translatable
# messages -- confirm before relying on that.
def _(string)
  string
end
|
||||
end
|
||||
|
||||
end
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class AfterBodyPhase < Phase
|
||||
|
||||
handle_end 'html'
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class AfterFramesetPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#after3
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class AfterHeadPhase < Phase
|
||||
|
||||
handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'
|
|
@ -1,11 +1,11 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class BeforeHeadPhase < Phase
|
||||
|
||||
handle_start 'html', 'head'
|
||||
|
||||
handle_end %w( html head body br ) => 'ImplyHead'
|
||||
handle_end %w( html head body br p ) => 'ImplyHead'
|
||||
|
||||
def processEOF
|
||||
startTagHead('head', {})
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InBodyPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
|
||||
|
@ -112,7 +112,7 @@ module HTML5lib
|
|||
|
||||
def startTagForm(name, attributes)
|
||||
if @tree.formPointer
|
||||
@parser.parseError('Unexpected start tag (form). Ignored.')
|
||||
@parser.parseError(_('Unexpected start tag (form). Ignored.'))
|
||||
else
|
||||
endTagP('p') if in_scope?('p')
|
||||
@tree.insertElement(name, attributes)
|
||||
|
@ -129,9 +129,9 @@ module HTML5lib
|
|||
if stopName.include?(node.name)
|
||||
poppedNodes = (0..i).collect { @tree.openElements.pop }
|
||||
if i >= 1
|
||||
@parser.parseError("Missing end tag%s (%s)" % [
|
||||
@parser.parseError(_("Missing end tag%s (%s)" % [
|
||||
(i>1 ? 's' : ''),
|
||||
poppedNodes.reverse.map {|item| item.name}.join(', ')])
|
||||
poppedNodes.reverse.map {|item| item.name}.join(', ')]))
|
||||
end
|
||||
break
|
||||
end
|
||||
|
@ -251,7 +251,7 @@ module HTML5lib
|
|||
end
|
||||
|
||||
def startTagIsindex(name, attributes)
|
||||
@parser.parseError("Unexpected start tag isindex. Don't use it!")
|
||||
@parser.parseError(_("Unexpected start tag isindex. Don't use it!"))
|
||||
return if @tree.formPointer
|
||||
processStartTag('form', {})
|
||||
processStartTag('hr', {})
|
||||
|
@ -311,8 +311,13 @@ module HTML5lib
|
|||
|
||||
def endTagP(name)
|
||||
@tree.generateImpliedEndTags('p') if in_scope?('p')
|
||||
@parser.parseError('Unexpected end tag (p).') unless @tree.openElements[-1].name == 'p'
|
||||
@tree.openElements.pop while in_scope?('p')
|
||||
@parser.parseError(_('Unexpected end tag (p).')) unless @tree.openElements[-1].name == 'p'
|
||||
if in_scope?('p')
|
||||
@tree.openElements.pop while in_scope?('p')
|
||||
else
|
||||
startTagCloseP('p', {})
|
||||
endTagP('p')
|
||||
end
|
||||
end
|
||||
|
||||
def endTagBody(name)
|
||||
|
@ -342,7 +347,7 @@ module HTML5lib
|
|||
@tree.generateImpliedEndTags if in_scope?(name)
|
||||
|
||||
unless @tree.openElements[-1].name == name
|
||||
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
|
||||
@parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
|
||||
end
|
||||
|
||||
if in_scope?(name)
|
||||
|
@ -351,7 +356,14 @@ module HTML5lib
|
|||
end
|
||||
|
||||
def endTagForm(name)
|
||||
endTagBlock(name)
|
||||
if in_scope?(name)
|
||||
@tree.generateImpliedEndTags
|
||||
end
|
||||
if @tree.openElements[-1].name != name
|
||||
@parser.parseError(_("End tag (form) seen too early. Ignored."))
|
||||
else
|
||||
@tree.openElements.pop
|
||||
end
|
||||
@tree.formPointer = nil
|
||||
end
|
||||
|
||||
|
@ -361,7 +373,7 @@ module HTML5lib
|
|||
@tree.generateImpliedEndTags(name)
|
||||
|
||||
unless @tree.openElements[-1].name == name
|
||||
@parser.parseError(("End tag (#{name}) seen too early. Expected other end tag."))
|
||||
@parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag."))
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -377,7 +389,7 @@ module HTML5lib
|
|||
end
|
||||
|
||||
unless @tree.openElements[-1].name == name
|
||||
@parser.parseError(("Unexpected end tag (#{name}). Expected other end tag."))
|
||||
@parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag."))
|
||||
end
|
||||
|
||||
HEADING_ELEMENTS.each do |element|
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InCaptionPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InCellPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InColumnGroupPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-column
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InFramesetPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
|
|
@ -1,12 +1,12 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InHeadPhase < Phase
|
||||
|
||||
handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta )
|
||||
|
||||
handle_end 'head'
|
||||
handle_end %w( html body br ) => 'ImplyAfterHead'
|
||||
handle_end %w( html body br p ) => 'ImplyAfterHead'
|
||||
handle_end %w( title style script )
|
||||
|
||||
def processEOF
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InRowPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InSelectPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-select
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InTableBodyPhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InTablePhase < Phase
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class InitialPhase < Phase
|
||||
|
||||
# This phase deals with error handling as well which is currently not
|
|
@ -1,4 +1,4 @@
|
|||
module HTML5lib
|
||||
module HTML5
|
||||
# Base class for helper objects that implement each phase of processing.
|
||||
#
|
||||
# Handler methods should be in the following order (they can be omitted):
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class RootElementPhase < Phase
|
||||
|
||||
def processEOF
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/html5parser/phase'
|
||||
require 'html5/html5parser/phase'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
class TrailingEndPhase < Phase
|
||||
|
||||
def processEOF
|
|
@ -1,7 +1,7 @@
|
|||
require 'stringio'
|
||||
require 'html5lib/constants'
|
||||
require 'html5/constants'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
|
||||
# Provides a unicode stream of characters to the HTMLTokenizer.
|
||||
|
||||
|
@ -10,7 +10,7 @@ module HTML5lib
|
|||
|
||||
class HTMLInputStream
|
||||
|
||||
attr_accessor :queue, :char_encoding
|
||||
attr_accessor :queue, :char_encoding, :errors
|
||||
|
||||
# Initialises the HTMLInputStream.
|
||||
#
|
||||
|
@ -40,25 +40,31 @@ module HTML5lib
|
|||
#Number of bytes to use when looking for a meta element with
|
||||
#encoding information
|
||||
@NUM_BYTES_META = 512
|
||||
#Number of bytes to use when using detecting encoding using chardet
|
||||
@NUM_BYTES_CHARDET = 256
|
||||
#Number of bytes to use when reading content
|
||||
@NUM_BYTES_BUFFER = 1024
|
||||
|
||||
#Encoding to use if no other information can be found
|
||||
@DEFAULT_ENCODING = 'windows-1252'
|
||||
|
||||
#Detect encoding iff no explicit "transport level" encoding is supplied
|
||||
if @encoding.nil? or not HTML5lib.is_valid_encoding(@encoding)
|
||||
if @encoding.nil? or not HTML5.is_valid_encoding(@encoding)
|
||||
@char_encoding = detect_encoding
|
||||
else
|
||||
@char_encoding = @encoding
|
||||
end
|
||||
|
||||
# Read bytes from stream decoding them into Unicode
|
||||
uString = @raw_stream.read
|
||||
@buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
|
||||
if @char_encoding == 'windows-1252'
|
||||
@win1252 = true
|
||||
elsif @char_encoding != 'utf-8'
|
||||
begin
|
||||
require 'iconv'
|
||||
begin
|
||||
uString = Iconv.iconv('utf-8', @char_encoding, uString).first
|
||||
@buffer << @raw_stream.read unless @raw_stream.eof?
|
||||
@buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
|
||||
rescue
|
||||
@win1252 = true
|
||||
end
|
||||
|
@ -67,10 +73,8 @@ module HTML5lib
|
|||
end
|
||||
end
|
||||
|
||||
# Convert the unicode string into a list to be used as the data stream
|
||||
@data_stream = uString
|
||||
|
||||
@queue = []
|
||||
@errors = []
|
||||
|
||||
# Reset position in the list to read from
|
||||
@tell = 0
|
||||
|
@ -109,9 +113,22 @@ module HTML5lib
|
|||
begin
|
||||
require 'rubygems'
|
||||
require 'UniversalDetector' # gem install chardet
|
||||
buffer = @raw_stream.read
|
||||
encoding = UniversalDetector::chardet(buffer)['encoding']
|
||||
seek(buffer, 0)
|
||||
buffers = []
|
||||
detector = UniversalDetector::Detector.instance
|
||||
detector.reset
|
||||
until @raw_stream.eof?
|
||||
buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
|
||||
break if !buffer or buffer.empty?
|
||||
buffers << buffer
|
||||
detector.feed(buffer)
|
||||
break if detector.instance_eval {@done}
|
||||
detector.instance_eval {
|
||||
@_mLastChar = @_mLastChar.chr if Fixnum === @_mLastChar
|
||||
}
|
||||
end
|
||||
detector.close
|
||||
encoding = detector.result['encoding']
|
||||
seek(buffers*'', 0)
|
||||
rescue LoadError
|
||||
end
|
||||
end
|
||||
|
@ -242,14 +259,20 @@ module HTML5lib
|
|||
unless @queue.empty?
|
||||
return @queue.shift
|
||||
else
|
||||
c = @data_stream[@tell]
|
||||
if @tell + 3 > @buffer.length and !@raw_stream.eof?
|
||||
# read next block
|
||||
@buffer = @buffer[@tell .. -1] + @raw_stream.read(@NUM_BYTES_BUFFER)
|
||||
@tell = 0
|
||||
end
|
||||
|
||||
c = @buffer[@tell]
|
||||
@tell += 1
|
||||
|
||||
case c
|
||||
when 0x01 .. 0x7F
|
||||
if c == 0x0D
|
||||
# normalize newlines
|
||||
@tell += 1 if @data_stream[@tell] == 0x0A
|
||||
@tell += 1 if @buffer[@tell] == 0x0A
|
||||
c = 0x0A
|
||||
end
|
||||
|
||||
|
@ -276,7 +299,7 @@ module HTML5lib
|
|||
when 0xC0 .. 0xFF
|
||||
if @win1252
|
||||
"\xC3" + (c-64).chr # convert to utf-8
|
||||
elsif @data_stream[@tell-1 .. -1] =~ /^
|
||||
elsif @buffer[@tell-1 .. @tell+3] =~ /^
|
||||
( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
|
||||
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
|
||||
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
|
||||
|
@ -292,6 +315,8 @@ module HTML5lib
|
|||
end
|
||||
|
||||
when 0x00
|
||||
@errors.push('null character found in input stream, ' +
|
||||
'replaced with U+FFFD')
|
||||
[0xFFFD].pack('U') # null characters are invalid
|
||||
|
||||
else
|
||||
|
@ -317,6 +342,10 @@ module HTML5lib
|
|||
@queue.insert(0, c) unless c == :EOF
|
||||
return char_stack.join('')
|
||||
end
|
||||
|
||||
# Pushes +characters+ back onto the front of @queue.
#
# characters - a single character (String), an Array of characters, or :EOF.
#              :EOF is ignored.
#
# Kernel#Array is used for the coercion because String no longer responds to
# #to_a on Ruby 1.9 and later; a String is pushed back as one element and an
# Array keeps its element order.
def unget(characters)
  @queue.unshift(*Array(characters)) unless characters == :EOF
end
|
||||
end
|
||||
|
||||
# String-like object with an assosiated position and various extra methods
|
||||
|
@ -433,14 +462,14 @@ module HTML5lib
|
|||
|
||||
if attr[0] == 'charset'
|
||||
tentative_encoding = attr[1]
|
||||
if HTML5lib.is_valid_encoding(tentative_encoding)
|
||||
if HTML5.is_valid_encoding(tentative_encoding)
|
||||
@encoding = tentative_encoding
|
||||
return false
|
||||
end
|
||||
elsif attr[0] == 'content'
|
||||
content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
|
||||
tentative_encoding = content_parser.parse
|
||||
if HTML5lib.is_valid_encoding(tentative_encoding)
|
||||
if HTML5.is_valid_encoding(tentative_encoding)
|
||||
@encoding = tentative_encoding
|
||||
return false
|
||||
end
|
|
@ -11,10 +11,10 @@
|
|||
#
|
||||
# @@TODO:
|
||||
# * Selectively lowercase only XHTML, but not foreign markup
|
||||
require 'html5lib/html5parser'
|
||||
require 'html5lib/constants'
|
||||
require 'html5/html5parser'
|
||||
require 'html5/constants'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
|
||||
# liberal XML parser
|
||||
class XMLParser < HTMLParser
|
||||
|
@ -25,25 +25,35 @@ module HTML5lib
|
|||
end
|
||||
|
||||
def normalizeToken(token)
|
||||
if token[:type] == :StartTag or token[:type] == :EmptyTag
|
||||
case token[:type]
|
||||
when :StartTag, :EmptyTag
|
||||
# We need to remove the duplicate attributes and convert attributes
|
||||
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
||||
# to a Hash so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
||||
|
||||
token[:data] = Hash[*token[:data].reverse.flatten]
|
||||
|
||||
# For EmptyTags, process both a Start and an End tag
|
||||
if token[:type] == :EmptyTag
|
||||
save = @tokenizer.contentModelFlag
|
||||
@phase.processStartTag(token[:name], token[:data])
|
||||
@tokenizer.contentModelFlag = save
|
||||
token[:data] = {}
|
||||
token[:type] = :EndTag
|
||||
end
|
||||
|
||||
elsif token[:type] == :EndTag
|
||||
when :Characters
|
||||
# un-escape RCDATA_ELEMENTS (e.g. style, script)
|
||||
if @tokenizer.contentModelFlag == :CDATA
|
||||
token[:data] = token[:data].
|
||||
gsub('<','<').gsub('>','>').gsub('&','&')
|
||||
end
|
||||
|
||||
when :EndTag
|
||||
if token[:data]
|
||||
parseError(_("End tag contains unexpected attributes."))
|
||||
end
|
||||
|
||||
elsif token[:type] == :Comment
|
||||
when :Comment
|
||||
# Rescue CDATA from the comments
|
||||
if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
|
||||
token[:type] = :Characters
|
|
@ -1,6 +1,7 @@
|
|||
require 'cgi'
|
||||
require 'html5/tokenizer'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
|
||||
# This module provides sanitization of XHTML+MathML+SVG
|
||||
# and of inline style attributes.
|
2
vendor/plugins/HTML5lib/lib/html5/serializer.rb
vendored
Normal file
2
vendor/plugins/HTML5lib/lib/html5/serializer.rb
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
require 'html5/serializer/htmlserializer'
|
||||
require 'html5/serializer/xhtmlserializer'
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5/constants'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
|
||||
class HTMLSerializer
|
||||
|
||||
|
@ -21,6 +21,7 @@ module HTML5lib
|
|||
@use_trailing_solidus = false
|
||||
@space_before_trailing_solidus = true
|
||||
@escape_lt_in_attrs = false
|
||||
@escape_rcdata = false
|
||||
|
||||
@omit_optional_tags = true
|
||||
@sanitize = false
|
||||
|
@ -43,22 +44,22 @@ module HTML5lib
|
|||
@errors = []
|
||||
|
||||
if encoding and @inject_meta_charset
|
||||
require 'html5lib/filters/inject_meta_charset'
|
||||
require 'html5/filters/inject_meta_charset'
|
||||
treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
|
||||
end
|
||||
|
||||
if @strip_whitespace
|
||||
require 'html5lib/filters/whitespace'
|
||||
require 'html5/filters/whitespace'
|
||||
treewalker = Filters::WhitespaceFilter.new(treewalker)
|
||||
end
|
||||
|
||||
if @sanitize
|
||||
require 'html5lib/filters/sanitizer'
|
||||
require 'html5/filters/sanitizer'
|
||||
treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
|
||||
end
|
||||
|
||||
if @omit_optional_tags
|
||||
require 'html5lib/filters/optionaltags'
|
||||
require 'html5/filters/optionaltags'
|
||||
treewalker = Filters::OptionalTagFilter.new(treewalker)
|
||||
end
|
||||
|
||||
|
@ -81,7 +82,7 @@ module HTML5lib
|
|||
|
||||
elsif [:StartTag, :EmptyTag].include? type
|
||||
name = token[:name]
|
||||
if RCDATA_ELEMENTS.include?(name)
|
||||
if RCDATA_ELEMENTS.include?(name) and not @escape_rcdata
|
||||
in_cdata = true
|
||||
elsif in_cdata
|
||||
serializeError(_("Unexpected child element of a CDATA element"))
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/serializer/htmlserializer'
|
||||
require 'html5/serializer/htmlserializer'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
|
||||
class XHTMLSerializer < HTMLSerializer
|
||||
DEFAULTS = {
|
||||
|
@ -8,7 +8,8 @@ module HTML5lib
|
|||
:minimize_boolean_attributes => false,
|
||||
:use_trailing_solidus => true,
|
||||
:escape_lt_in_attrs => true,
|
||||
:omit_optional_tags => false
|
||||
:omit_optional_tags => false,
|
||||
:escape_rcdata => true
|
||||
}
|
||||
|
||||
def initialize(options={})
|
|
@ -1,7 +1,7 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5lib/inputstream'
|
||||
require 'html5/constants'
|
||||
require 'html5/inputstream'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
|
||||
# This class takes care of tokenizing HTML.
|
||||
#
|
||||
|
@ -84,9 +84,9 @@ module HTML5lib
|
|||
# Start processing. When EOF is reached @state will return false
|
||||
# instead of true and the loop will terminate.
|
||||
while send @state
|
||||
while not @tokenQueue.empty?
|
||||
yield @tokenQueue.shift
|
||||
end
|
||||
yield :type => :ParseError, :data => @stream.errors.shift until
|
||||
@stream.errors.empty?
|
||||
yield @tokenQueue.shift until @tokenQueue.empty?
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -109,7 +109,7 @@ module HTML5lib
|
|||
|
||||
# The character we just consumed need to be put back on the stack so it
|
||||
# doesn't get lost...
|
||||
@stream.queue.push(data)
|
||||
@stream.unget(data)
|
||||
end
|
||||
|
||||
# This function returns either U+FFFD or the character based on the
|
||||
|
@ -128,7 +128,6 @@ module HTML5lib
|
|||
radix = 16
|
||||
end
|
||||
|
||||
char = [0xFFFD].pack('U')
|
||||
charStack = []
|
||||
|
||||
# Consume all the characters that are in range while making sure we
|
||||
|
@ -142,17 +141,25 @@ module HTML5lib
|
|||
# Convert the set of characters consumed to an int.
|
||||
charAsInt = charStack.join('').to_i(radix)
|
||||
|
||||
# If the integer is between 127 and 160 (so 128 and bigger and 159 and
|
||||
# smaller) we need to do the "windows trick".
|
||||
if (127...160).include? charAsInt
|
||||
if charAsInt == 13
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Incorrect CR newline entity. Replaced with LF.")})
|
||||
charAsInt = 10
|
||||
elsif (128..159).include? charAsInt
|
||||
# If the integer is between 127 and 160 (so 128 and bigger and 159
|
||||
# and smaller) we need to do the "windows trick".
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Entity used with illegal number (windows-1252 reference).")})
|
||||
|
||||
charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
|
||||
end
|
||||
|
||||
if charAsInt > 0 and charAsInt <= 1114111
|
||||
if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343)
|
||||
char = [charAsInt].pack('U')
|
||||
else
|
||||
char = [0xFFFD].pack('U')
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity represents an illegal codepoint.")})
|
||||
end
|
||||
|
||||
# Discard the ; if present. Otherwise, put it back on the queue and
|
||||
|
@ -160,18 +167,18 @@ module HTML5lib
|
|||
if c != ";"
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity didn't end with ';'.")})
|
||||
@stream.queue.push(c)
|
||||
@stream.unget(c)
|
||||
end
|
||||
|
||||
return char
|
||||
end
|
||||
|
||||
def consumeEntity
|
||||
def consumeEntity(from_attribute=false)
|
||||
char = nil
|
||||
charStack = [@stream.char]
|
||||
if SPACE_CHARACTERS.include?(charStack[0]) or
|
||||
[:EOF, '<', '&'].include?(charStack[0])
|
||||
@stream.queue+= charStack
|
||||
@stream.unget(charStack)
|
||||
elsif charStack[0] == "#"
|
||||
# We might have a number entity here.
|
||||
charStack += [@stream.char, @stream.char]
|
||||
|
@ -179,22 +186,22 @@ module HTML5lib
|
|||
# If we reach the end of the file put everything up to :EOF
|
||||
# back in the queue
|
||||
charStack = charStack[0...charStack.index(:EOF)]
|
||||
@stream.queue+= charStack
|
||||
@stream.unget(charStack)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity expected. Got end of file instead.")})
|
||||
else
|
||||
if charStack[1].downcase == "x" \
|
||||
and HEX_DIGITS.include? charStack[2]
|
||||
# Hexadecimal entity detected.
|
||||
@stream.queue.push(charStack[2])
|
||||
@stream.unget(charStack[2])
|
||||
char = consumeNumberEntity(true)
|
||||
elsif DIGITS.include? charStack[1]
|
||||
# Decimal entity detected.
|
||||
@stream.queue += charStack[1..-1]
|
||||
@stream.unget(charStack[1..-1])
|
||||
char = consumeNumberEntity(false)
|
||||
else
|
||||
# No number entity detected.
|
||||
@stream.queue += charStack
|
||||
@stream.unget(charStack)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity expected but none found.")})
|
||||
end
|
||||
|
@ -209,6 +216,8 @@ module HTML5lib
|
|||
filteredEntityList.reject! {|e| e[0].chr != charStack[0]}
|
||||
entityName = nil
|
||||
|
||||
# Try to find the longest entity the string will match to take care
|
||||
# of &noti for instance.
|
||||
while charStack[-1] != :EOF
|
||||
name = charStack.join('')
|
||||
if filteredEntityList.any? {|e| e[0...name.length] == name}
|
||||
|
@ -220,6 +229,7 @@ module HTML5lib
|
|||
|
||||
if ENTITIES.include? name
|
||||
entityName = name
|
||||
break if entityName[-1] == ';'
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -228,15 +238,23 @@ module HTML5lib
|
|||
|
||||
# Check whether or not the last character returned can be
|
||||
# discarded or needs to be put back.
|
||||
if not charStack[-1] == ";"
|
||||
if entityName[-1] != ?;
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Named entity didn't end with ';'.")})
|
||||
@stream.queue += charStack[entityName.length..-1]
|
||||
end
|
||||
|
||||
if charStack[-1] != ";" and from_attribute and
|
||||
(ASCII_LETTERS.include?(charStack[entityName.length]) or
|
||||
DIGITS.include?(charStack[entityName.length]))
|
||||
@stream.unget(charStack)
|
||||
char = '&'
|
||||
else
|
||||
@stream.unget(charStack[entityName.length..-1])
|
||||
end
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Named entity expected. Got none.")})
|
||||
@stream.queue += charStack
|
||||
@stream.unget(charStack)
|
||||
end
|
||||
end
|
||||
return char
|
||||
|
@ -244,7 +262,7 @@ module HTML5lib
|
|||
|
||||
# This method replaces the need for "entityInAttributeValueState".
|
||||
def processEntityInAttribute
|
||||
entity = consumeEntity
|
||||
entity = consumeEntity(true)
|
||||
if entity
|
||||
@currentToken[:data][-1][1] += entity
|
||||
else
|
||||
|
@ -274,20 +292,23 @@ module HTML5lib
|
|||
@lastFourChars.shift if @lastFourChars.length > 4
|
||||
end
|
||||
|
||||
if data == "&" and [:PCDATA,:RCDATA].include?(@contentModelFlag)
|
||||
@state = @states[:entityData]
|
||||
if data == "&" and !@escapeFlag and
|
||||
[:PCDATA,:RCDATA].include?(@contentModelFlag)
|
||||
@state = @states[:entityData]
|
||||
|
||||
elsif data == "-" and [:CDATA,:RCDATA].include?(@contentModelFlag) and
|
||||
@escapeFlag == false and @lastFourChars.join('') == "<!--"
|
||||
elsif data == "-" and !@escapeFlag and
|
||||
[:CDATA,:RCDATA].include?(@contentModelFlag) and
|
||||
@lastFourChars.join('') == "<!--"
|
||||
@escapeFlag = true
|
||||
@tokenQueue.push({:type => :Characters, :data => data})
|
||||
|
||||
elsif data == "<" and @escapeFlag == false and
|
||||
elsif data == "<" and !@escapeFlag and
|
||||
[:PCDATA,:CDATA,:RCDATA].include?(@contentModelFlag)
|
||||
@state = @states[:tagOpen]
|
||||
|
||||
elsif data == ">" and [:CDATA,:RCDATA].include?(@contentModelFlag) and
|
||||
@escapeFlag == true and @lastFourChars[1..-1].join('') == "-->"
|
||||
elsif data == ">" and @escapeFlag and
|
||||
[:CDATA,:RCDATA].include?(@contentModelFlag) and
|
||||
@lastFourChars[1..-1].join('') == "-->"
|
||||
@escapeFlag = false
|
||||
@tokenQueue.push({:type => :Characters, :data => data})
|
||||
|
||||
|
@ -345,14 +366,14 @@ module HTML5lib
|
|||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected tag name. Got '?' instead (HTML doesn't " +
|
||||
"support processing instructions).")})
|
||||
@stream.queue.push(data)
|
||||
@stream.unget(data)
|
||||
@state = @states[:bogusComment]
|
||||
else
|
||||
# XXX
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected tag name. Got something else instead")})
|
||||
@tokenQueue.push({:type => :Characters, :data => "<"})
|
||||
@stream.queue.push(data)
|
||||
@stream.unget(data)
|
||||
@state = @states[:data]
|
||||
end
|
||||
else
|
||||
|
@ -363,7 +384,7 @@ module HTML5lib
|
|||
@state = @states[:closeTagOpen]
|
||||
else
|
||||
@tokenQueue.push({:type => :Characters, :data => "<"})
|
||||
@stream.queue.insert(0, data)
|
||||
@stream.unget(data)
|
||||
@state = @states[:data]
|
||||
end
|
||||
end
|
||||
|
@ -388,7 +409,7 @@ module HTML5lib
|
|||
|
||||
# Since this is just for checking. We put the characters back on
|
||||
# the stack.
|
||||
@stream.queue += charStack
|
||||
@stream.unget(charStack)
|
||||
end
|
||||
|
||||
if @currentToken and
|
||||
|
@ -426,7 +447,7 @@ module HTML5lib
|
|||
# XXX data can be _'_...
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected closing tag. Unexpected character '#{data}' found.")})
|
||||
@stream.queue.push(data)
|
||||
@stream.unget(data)
|
||||
@state = @states[:bogusComment]
|
||||
end
|
||||
|
||||
|
@ -556,7 +577,7 @@ module HTML5lib
|
|||
@state = @states[:attributeValueDoubleQuoted]
|
||||
elsif data == "&"
|
||||
@state = @states[:attributeValueUnQuoted]
|
||||
@stream.queue.push(data);
|
||||
@stream.unget(data);
|
||||
elsif data == "'"
|
||||
@state = @states[:attributeValueSingleQuoted]
|
||||
elsif data == ">"
|
||||
|
@ -656,7 +677,7 @@ module HTML5lib
|
|||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected '--' or 'DOCTYPE'. Not found.")})
|
||||
@stream.queue += charStack
|
||||
@stream.unget(charStack)
|
||||
@state = @states[:bogusComment]
|
||||
end
|
||||
end
|
||||
|
@ -771,7 +792,7 @@ module HTML5lib
|
|||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("No space after literal string 'DOCTYPE'.")})
|
||||
@stream.queue.push(data)
|
||||
@stream.unget(data)
|
||||
@state = @states[:beforeDoctypeName]
|
||||
end
|
||||
return true
|
||||
|
@ -827,7 +848,7 @@ module HTML5lib
|
|||
@state = @states[:data]
|
||||
elsif data == :EOF
|
||||
@currentToken[:data] = true
|
||||
@stream.queue.push(data)
|
||||
@stream.unget(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
@currentToken[:correct] = false
|
||||
|
@ -842,7 +863,7 @@ module HTML5lib
|
|||
elsif token == "system"
|
||||
@state = @states[:beforeDoctypeSystemIdentifier]
|
||||
else
|
||||
@stream.queue += charStack
|
||||
@stream.unget(charStack)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected 'public' or 'system'. Got '#{charStack.join('')}'")})
|
||||
@state = @states[:bogusDoctype]
|
||||
|
@ -1028,7 +1049,7 @@ module HTML5lib
|
|||
@state = @states[:data]
|
||||
elsif data == :EOF
|
||||
# XXX EMIT
|
||||
@stream.queue.push(data)
|
||||
@stream.unget(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in bogus doctype.")})
|
||||
@currentToken[:correct] = false
|
|
@ -1,17 +1,17 @@
|
|||
module HTML5lib
|
||||
module HTML5
|
||||
module TreeBuilders
|
||||
|
||||
class << self
|
||||
def [](name)
|
||||
case name.to_s.downcase
|
||||
when 'simpletree' then
|
||||
require 'html5lib/treebuilders/simpletree'
|
||||
require 'html5/treebuilders/simpletree'
|
||||
SimpleTree::TreeBuilder
|
||||
when 'rexml' then
|
||||
require 'html5lib/treebuilders/rexml'
|
||||
require 'html5/treebuilders/rexml'
|
||||
REXML::TreeBuilder
|
||||
when 'hpricot' then
|
||||
require 'html5lib/treebuilders/hpricot'
|
||||
require 'html5/treebuilders/hpricot'
|
||||
Hpricot::TreeBuilder
|
||||
else
|
||||
raise "Unknown TreeBuilder #{name}"
|
|
@ -1,8 +1,8 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5/constants'
|
||||
|
||||
#XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
|
||||
# The scope markers are inserted when entering buttons, object elements,
|
||||
# marquees, table cells, and table captions, and are used to prevent formatting
|
|
@ -1,221 +1,221 @@
|
|||
require 'html5lib/treebuilders/base'
|
||||
require 'rubygems'
|
||||
require 'hpricot'
|
||||
require 'forwardable'
|
||||
|
||||
module HTML5lib
|
||||
module TreeBuilders
|
||||
module Hpricot
|
||||
|
||||
class Node < Base::Node
|
||||
|
||||
extend Forwardable
|
||||
|
||||
def_delegators :@hpricot, :name
|
||||
|
||||
attr_accessor :hpricot
|
||||
|
||||
def initialize(name)
|
||||
super(name)
|
||||
@hpricot = self.class.hpricot_class.new name
|
||||
end
|
||||
|
||||
def appendChild(node)
|
||||
if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
|
||||
childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s
|
||||
else
|
||||
childNodes << node
|
||||
hpricot.children << node.hpricot
|
||||
end
|
||||
if (oldparent = node.hpricot.parent) != nil
|
||||
oldparent.children.delete_at(oldparent.children.index(node.hpricot))
|
||||
end
|
||||
node.hpricot.parent = hpricot
|
||||
node.parent = self
|
||||
end
|
||||
|
||||
def removeChild(node)
|
||||
childNodes.delete(node)
|
||||
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
|
||||
node.hpricot.parent = nil
|
||||
node.parent = nil
|
||||
end
|
||||
|
||||
def insertText(data, before=nil)
|
||||
if before
|
||||
insertBefore(TextNode.new(data), before)
|
||||
else
|
||||
appendChild(TextNode.new(data))
|
||||
end
|
||||
end
|
||||
|
||||
def insertBefore(node, refNode)
|
||||
index = childNodes.index(refNode)
|
||||
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
|
||||
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
|
||||
else
|
||||
refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
|
||||
childNodes.insert(index, node)
|
||||
end
|
||||
end
|
||||
|
||||
def hasContent
|
||||
childNodes.any?
|
||||
end
|
||||
end
|
||||
|
||||
class Element < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Elem
|
||||
end
|
||||
|
||||
def initialize(name)
|
||||
super(name)
|
||||
|
||||
@hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
|
||||
end
|
||||
|
||||
def name
|
||||
@hpricot.stag.name
|
||||
end
|
||||
|
||||
def cloneNode
|
||||
attributes.inject(self.class.new(name)) do |node, (name, value)|
|
||||
node.hpricot[name] = value
|
||||
node
|
||||
end
|
||||
end
|
||||
|
||||
# A call to Hpricot::Elem#raw_attributes is built dynamically,
|
||||
# so alterations to the returned value (a hash) will be lost.
|
||||
#
|
||||
# AttributeProxy works around this by forwarding :[]= calls
|
||||
# to the raw_attributes accessor on the element start tag.
|
||||
#
|
||||
class AttributeProxy
|
||||
def initialize(hpricot)
|
||||
@hpricot = hpricot
|
||||
end
|
||||
|
||||
def []=(k, v)
|
||||
@hpricot.stag.send(stag_attributes_method)[k] = v
|
||||
end
|
||||
|
||||
def stag_attributes_method
|
||||
# STag#attributes changed to STag#raw_attributes after Hpricot 0.5
|
||||
@hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
|
||||
end
|
||||
|
||||
def method_missing(*a, &b)
|
||||
@hpricot.attributes.send(*a, &b)
|
||||
end
|
||||
end
|
||||
|
||||
def attributes
|
||||
AttributeProxy.new(@hpricot)
|
||||
end
|
||||
|
||||
def attributes=(attrs)
|
||||
attrs.each { |name, value| @hpricot[name] = value }
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
tree = "\n|#{' ' * indent}<#{name}>"
|
||||
indent += 2
|
||||
attributes.each do |name, value|
|
||||
next if name == 'xmlns'
|
||||
tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
|
||||
end
|
||||
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
|
||||
end
|
||||
end
|
||||
|
||||
class Document < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Doc
|
||||
end
|
||||
|
||||
def initialize
|
||||
super(nil)
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentType < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::DocType
|
||||
end
|
||||
|
||||
def initialize(name)
|
||||
begin
|
||||
super(name)
|
||||
rescue ArgumentError # needs 3...
|
||||
end
|
||||
|
||||
@hpricot = ::Hpricot::DocType.new(name, nil, nil)
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentFragment < Element
|
||||
def initialize
|
||||
super('')
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
|
||||
end
|
||||
end
|
||||
|
||||
class TextNode < Node
|
||||
def initialize(data)
|
||||
@hpricot = ::Hpricot::Text.new(data)
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
"\n|#{' ' * indent}\"#{hpricot.content}\""
|
||||
end
|
||||
end
|
||||
|
||||
class CommentNode < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Comment
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
"\n|#{' ' * indent}<!-- #{hpricot.content} -->"
|
||||
end
|
||||
end
|
||||
|
||||
class TreeBuilder < Base::TreeBuilder
|
||||
def initialize
|
||||
@documentClass = Document
|
||||
@doctypeClass = DocumentType
|
||||
@elementClass = Element
|
||||
@commentClass = CommentNode
|
||||
@fragmentClass = DocumentFragment
|
||||
end
|
||||
|
||||
def testSerializer(node)
|
||||
node.printTree
|
||||
end
|
||||
|
||||
def getDocument
|
||||
@document.hpricot
|
||||
end
|
||||
|
||||
def getFragment
|
||||
@document = super
|
||||
return @document.hpricot.children
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
||||
require 'html5/treebuilders/base'
|
||||
require 'rubygems'
|
||||
require 'hpricot'
|
||||
require 'forwardable'
|
||||
|
||||
module HTML5
|
||||
module TreeBuilders
|
||||
module Hpricot
|
||||
|
||||
class Node < Base::Node
|
||||
|
||||
extend Forwardable
|
||||
|
||||
def_delegators :@hpricot, :name
|
||||
|
||||
attr_accessor :hpricot
|
||||
|
||||
def initialize(name)
|
||||
super(name)
|
||||
@hpricot = self.class.hpricot_class.new name
|
||||
end
|
||||
|
||||
def appendChild(node)
|
||||
if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
|
||||
childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s
|
||||
else
|
||||
childNodes << node
|
||||
hpricot.children << node.hpricot
|
||||
end
|
||||
if (oldparent = node.hpricot.parent) != nil
|
||||
oldparent.children.delete_at(oldparent.children.index(node.hpricot))
|
||||
end
|
||||
node.hpricot.parent = hpricot
|
||||
node.parent = self
|
||||
end
|
||||
|
||||
def removeChild(node)
|
||||
childNodes.delete(node)
|
||||
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
|
||||
node.hpricot.parent = nil
|
||||
node.parent = nil
|
||||
end
|
||||
|
||||
def insertText(data, before=nil)
|
||||
if before
|
||||
insertBefore(TextNode.new(data), before)
|
||||
else
|
||||
appendChild(TextNode.new(data))
|
||||
end
|
||||
end
|
||||
|
||||
def insertBefore(node, refNode)
|
||||
index = childNodes.index(refNode)
|
||||
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
|
||||
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
|
||||
else
|
||||
refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot)
|
||||
childNodes.insert(index, node)
|
||||
end
|
||||
end
|
||||
|
||||
def hasContent
|
||||
childNodes.any?
|
||||
end
|
||||
end
|
||||
|
||||
class Element < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Elem
|
||||
end
|
||||
|
||||
def initialize(name)
|
||||
super(name)
|
||||
|
||||
@hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
|
||||
end
|
||||
|
||||
def name
|
||||
@hpricot.stag.name
|
||||
end
|
||||
|
||||
def cloneNode
|
||||
attributes.inject(self.class.new(name)) do |node, (name, value)|
|
||||
node.hpricot[name] = value
|
||||
node
|
||||
end
|
||||
end
|
||||
|
||||
# A call to Hpricot::Elem#raw_attributes is built dynamically,
|
||||
# so alterations to the returned value (a hash) will be lost.
|
||||
#
|
||||
# AttributeProxy works around this by forwarding :[]= calls
|
||||
# to the raw_attributes accessor on the element start tag.
|
||||
#
|
||||
class AttributeProxy
|
||||
def initialize(hpricot)
|
||||
@hpricot = hpricot
|
||||
end
|
||||
|
||||
def []=(k, v)
|
||||
@hpricot.stag.send(stag_attributes_method)[k] = v
|
||||
end
|
||||
|
||||
def stag_attributes_method
|
||||
# STag#attributes changed to STag#raw_attributes after Hpricot 0.5
|
||||
@hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
|
||||
end
|
||||
|
||||
def method_missing(*a, &b)
|
||||
@hpricot.attributes.send(*a, &b)
|
||||
end
|
||||
end
|
||||
|
||||
def attributes
|
||||
AttributeProxy.new(@hpricot)
|
||||
end
|
||||
|
||||
def attributes=(attrs)
|
||||
attrs.each { |name, value| @hpricot[name] = value }
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
tree = "\n|#{' ' * indent}<#{name}>"
|
||||
indent += 2
|
||||
attributes.each do |name, value|
|
||||
next if name == 'xmlns'
|
||||
tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
|
||||
end
|
||||
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
|
||||
end
|
||||
end
|
||||
|
||||
class Document < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Doc
|
||||
end
|
||||
|
||||
def initialize
|
||||
super(nil)
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentType < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::DocType
|
||||
end
|
||||
|
||||
def initialize(name)
|
||||
begin
|
||||
super(name)
|
||||
rescue ArgumentError # needs 3...
|
||||
end
|
||||
|
||||
@hpricot = ::Hpricot::DocType.new(name, nil, nil)
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentFragment < Element
|
||||
def initialize
|
||||
super('')
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
|
||||
end
|
||||
end
|
||||
|
||||
class TextNode < Node
|
||||
def initialize(data)
|
||||
@hpricot = ::Hpricot::Text.new(data)
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
"\n|#{' ' * indent}\"#{hpricot.content}\""
|
||||
end
|
||||
end
|
||||
|
||||
class CommentNode < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Comment
|
||||
end
|
||||
|
||||
def printTree(indent=0)
|
||||
"\n|#{' ' * indent}<!-- #{hpricot.content} -->"
|
||||
end
|
||||
end
|
||||
|
||||
class TreeBuilder < Base::TreeBuilder
|
||||
def initialize
|
||||
@documentClass = Document
|
||||
@doctypeClass = DocumentType
|
||||
@elementClass = Element
|
||||
@commentClass = CommentNode
|
||||
@fragmentClass = DocumentFragment
|
||||
end
|
||||
|
||||
def testSerializer(node)
|
||||
node.printTree
|
||||
end
|
||||
|
||||
def getDocument
|
||||
@document.hpricot
|
||||
end
|
||||
|
||||
def getFragment
|
||||
@document = super
|
||||
return @document.hpricot.children
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
|
@ -1,8 +1,8 @@
|
|||
require 'html5lib/treebuilders/base'
|
||||
require 'html5/treebuilders/base'
|
||||
require 'rexml/document'
|
||||
require 'forwardable'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module TreeBuilders
|
||||
module REXML
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
require 'html5lib/treebuilders/base'
|
||||
require 'html5/treebuilders/base'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module TreeBuilders
|
||||
module SimpleTree
|
||||
|
|
@ -1,19 +1,19 @@
|
|||
require 'html5lib/treewalkers/base'
|
||||
require 'html5/treewalkers/base'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module TreeWalkers
|
||||
|
||||
class << self
|
||||
def [](name)
|
||||
case name.to_s.downcase
|
||||
when 'simpletree' then
|
||||
require 'html5lib/treewalkers/simpletree'
|
||||
require 'html5/treewalkers/simpletree'
|
||||
SimpleTree::TreeWalker
|
||||
when 'rexml' then
|
||||
require 'html5lib/treewalkers/rexml'
|
||||
require 'html5/treewalkers/rexml'
|
||||
REXML::TreeWalker
|
||||
when 'hpricot' then
|
||||
require 'html5lib/treewalkers/hpricot'
|
||||
require 'html5/treewalkers/hpricot'
|
||||
Hpricot::TreeWalker
|
||||
else
|
||||
raise "Unknown TreeWalker #{name}"
|
|
@ -1,5 +1,5 @@
|
|||
require 'html5lib/constants'
|
||||
module HTML5lib
|
||||
require 'html5/constants'
|
||||
module HTML5
|
||||
module TreeWalkers
|
||||
|
||||
module TokenConstructor
|
|
@ -1,10 +1,10 @@
|
|||
require 'html5lib/treewalkers/base'
|
||||
require 'html5/treewalkers/base'
|
||||
require 'rexml/document'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module TreeWalkers
|
||||
module Hpricot
|
||||
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
|
||||
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
|
||||
|
||||
def node_details(node)
|
||||
case node
|
|
@ -1,10 +1,10 @@
|
|||
require 'html5lib/treewalkers/base'
|
||||
require 'html5/treewalkers/base'
|
||||
require 'rexml/document'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module TreeWalkers
|
||||
module REXML
|
||||
class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker
|
||||
class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker
|
||||
|
||||
def node_details(node)
|
||||
case node
|
|
@ -1,10 +1,10 @@
|
|||
require 'html5lib/treewalkers/base'
|
||||
require 'html5/treewalkers/base'
|
||||
|
||||
module HTML5lib
|
||||
module HTML5
|
||||
module TreeWalkers
|
||||
module SimpleTree
|
||||
class TreeWalker < HTML5lib::TreeWalkers::Base
|
||||
include HTML5lib::TreeBuilders::SimpleTree
|
||||
class TreeWalker < HTML5::TreeWalkers::Base
|
||||
include HTML5::TreeBuilders::SimpleTree
|
||||
|
||||
def walk(node)
|
||||
case node
|
708
vendor/plugins/HTML5lib/lib/html5lib/constants.rb
vendored
708
vendor/plugins/HTML5lib/lib/html5lib/constants.rb
vendored
|
@ -1,708 +0,0 @@
|
|||
module HTML5lib
|
||||
|
||||
class EOF < Exception; end
|
||||
|
||||
CONTENT_MODEL_FLAGS = [
|
||||
:PCDATA,
|
||||
:RCDATA,
|
||||
:CDATA,
|
||||
:PLAINTEXT
|
||||
]
|
||||
|
||||
SCOPING_ELEMENTS = %w[
|
||||
button
|
||||
caption
|
||||
html
|
||||
marquee
|
||||
object
|
||||
table
|
||||
td
|
||||
th
|
||||
]
|
||||
|
||||
FORMATTING_ELEMENTS = %w[
|
||||
a
|
||||
b
|
||||
big
|
||||
em
|
||||
font
|
||||
i
|
||||
nobr
|
||||
s
|
||||
small
|
||||
strike
|
||||
strong
|
||||
tt
|
||||
u
|
||||
]
|
||||
|
||||
SPECIAL_ELEMENTS = %w[
|
||||
address
|
||||
area
|
||||
base
|
||||
basefont
|
||||
bgsound
|
||||
blockquote
|
||||
body
|
||||
br
|
||||
center
|
||||
col
|
||||
colgroup
|
||||
dd
|
||||
dir
|
||||
div
|
||||
dl
|
||||
dt
|
||||
embed
|
||||
fieldset
|
||||
form
|
||||
frame
|
||||
frameset
|
||||
h1
|
||||
h2
|
||||
h3
|
||||
h4
|
||||
h5
|
||||
h6
|
||||
head
|
||||
hr
|
||||
iframe
|
||||
image
|
||||
img
|
||||
input
|
||||
isindex
|
||||
li
|
||||
link
|
||||
listing
|
||||
menu
|
||||
meta
|
||||
noembed
|
||||
noframes
|
||||
noscript
|
||||
ol
|
||||
optgroup
|
||||
option
|
||||
p
|
||||
param
|
||||
plaintext
|
||||
pre
|
||||
script
|
||||
select
|
||||
spacer
|
||||
style
|
||||
tbody
|
||||
textarea
|
||||
tfoot
|
||||
thead
|
||||
title
|
||||
tr
|
||||
ul
|
||||
wbr
|
||||
]
|
||||
|
||||
SPACE_CHARACTERS = %W[
|
||||
\t
|
||||
\n
|
||||
\x0B
|
||||
\x0C
|
||||
\x20
|
||||
\r
|
||||
]
|
||||
|
||||
TABLE_INSERT_MODE_ELEMENTS = %w[
|
||||
table
|
||||
tbody
|
||||
tfoot
|
||||
thead
|
||||
tr
|
||||
]
|
||||
|
||||
ASCII_LOWERCASE = ('a'..'z').to_a.join('')
|
||||
ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
|
||||
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
|
||||
DIGITS = '0'..'9'
|
||||
HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a
|
||||
|
||||
# Heading elements need to be ordered
|
||||
HEADING_ELEMENTS = %w[
|
||||
h1
|
||||
h2
|
||||
h3
|
||||
h4
|
||||
h5
|
||||
h6
|
||||
]
|
||||
|
||||
# XXX What about event-source and command?
|
||||
VOID_ELEMENTS = %w[
|
||||
base
|
||||
link
|
||||
meta
|
||||
hr
|
||||
br
|
||||
img
|
||||
embed
|
||||
param
|
||||
area
|
||||
col
|
||||
input
|
||||
]
|
||||
|
||||
CDATA_ELEMENTS = %w[title textarea]
|
||||
|
||||
RCDATA_ELEMENTS = %w[
|
||||
style
|
||||
script
|
||||
xmp
|
||||
iframe
|
||||
noembed
|
||||
noframes
|
||||
noscript
|
||||
]
|
||||
|
||||
BOOLEAN_ATTRIBUTES = {
|
||||
:global => %w[irrelevant],
|
||||
'style' => %w[scoped],
|
||||
'img' => %w[ismap],
|
||||
'audio' => %w[autoplay controls],
|
||||
'video' => %w[autoplay controls],
|
||||
'script' => %w[defer async],
|
||||
'details' => %w[open],
|
||||
'datagrid' => %w[multiple disabled],
|
||||
'command' => %w[hidden disabled checked default],
|
||||
'menu' => %w[autosubmit],
|
||||
'fieldset' => %w[disabled readonly],
|
||||
'option' => %w[disabled readonly selected],
|
||||
'optgroup' => %w[disabled readonly],
|
||||
'button' => %w[disabled autofocus],
|
||||
'input' => %w[disabled readonly required autofocus checked ismap],
|
||||
'select' => %w[disabled readonly autofocus multiple],
|
||||
'output' => %w[disabled readonly]
|
||||
}
|
||||
|
||||
# entitiesWindows1252 has to be _ordered_ and needs to have an index.
|
||||
ENTITIES_WINDOWS1252 = [
|
||||
8364, # 0x80 0x20AC EURO SIGN
|
||||
65533, # 0x81 UNDEFINED
|
||||
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
|
||||
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
|
||||
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
|
||||
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
|
||||
8224, # 0x86 0x2020 DAGGER
|
||||
8225, # 0x87 0x2021 DOUBLE DAGGER
|
||||
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||
8240, # 0x89 0x2030 PER MILLE SIGN
|
||||
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
|
||||
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
|
||||
65533, # 0x8D UNDEFINED
|
||||
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
|
||||
65533, # 0x8F UNDEFINED
|
||||
65533, # 0x90 UNDEFINED
|
||||
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
|
||||
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
|
||||
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
|
||||
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
|
||||
8226, # 0x95 0x2022 BULLET
|
||||
8211, # 0x96 0x2013 EN DASH
|
||||
8212, # 0x97 0x2014 EM DASH
|
||||
732, # 0x98 0x02DC SMALL TILDE
|
||||
8482, # 0x99 0x2122 TRADE MARK SIGN
|
||||
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
|
||||
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
|
||||
65533, # 0x9D UNDEFINED
|
||||
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
|
||||
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||
]
|
||||
|
||||
private
|
||||
|
||||
def self.U n
|
||||
[n].pack('U')
|
||||
end
|
||||
|
||||
public
|
||||
|
||||
ENTITIES = {
|
||||
"AElig" => U(0xC6),
|
||||
"Aacute" => U(0xC1),
|
||||
"Acirc" => U(0xC2),
|
||||
"Agrave" => U(0xC0),
|
||||
"Alpha" => U(0x0391),
|
||||
"Aring" => U(0xC5),
|
||||
"Atilde" => U(0xC3),
|
||||
"Auml" => U(0xC4),
|
||||
"Beta" => U(0x0392),
|
||||
"Ccedil" => U(0xC7),
|
||||
"Chi" => U(0x03A7),
|
||||
"Dagger" => U(0x2021),
|
||||
"Delta" => U(0x0394),
|
||||
"ETH" => U(0xD0),
|
||||
"Eacute" => U(0xC9),
|
||||
"Ecirc" => U(0xCA),
|
||||
"Egrave" => U(0xC8),
|
||||
"Epsilon" => U(0x0395),
|
||||
"Eta" => U(0x0397),
|
||||
"Euml" => U(0xCB),
|
||||
"Gamma" => U(0x0393),
|
||||
"Iacute" => U(0xCD),
|
||||
"Icirc" => U(0xCE),
|
||||
"Igrave" => U(0xCC),
|
||||
"Iota" => U(0x0399),
|
||||
"Iuml" => U(0xCF),
|
||||
"Kappa" => U(0x039A),
|
||||
"Lambda" => U(0x039B),
|
||||
"Mu" => U(0x039C),
|
||||
"Ntilde" => U(0xD1),
|
||||
"Nu" => U(0x039D),
|
||||
"OElig" => U(0x0152),
|
||||
"Oacute" => U(0xD3),
|
||||
"Ocirc" => U(0xD4),
|
||||
"Ograve" => U(0xD2),
|
||||
"Omega" => U(0x03A9),
|
||||
"Omicron" => U(0x039F),
|
||||
"Oslash" => U(0xD8),
|
||||
"Otilde" => U(0xD5),
|
||||
"Ouml" => U(0xD6),
|
||||
"Phi" => U(0x03A6),
|
||||
"Pi" => U(0x03A0),
|
||||
"Prime" => U(0x2033),
|
||||
"Psi" => U(0x03A8),
|
||||
"Rho" => U(0x03A1),
|
||||
"Scaron" => U(0x0160),
|
||||
"Sigma" => U(0x03A3),
|
||||
"THORN" => U(0xDE),
|
||||
"Tau" => U(0x03A4),
|
||||
"Theta" => U(0x0398),
|
||||
"Uacute" => U(0xDA),
|
||||
"Ucirc" => U(0xDB),
|
||||
"Ugrave" => U(0xD9),
|
||||
"Upsilon" => U(0x03A5),
|
||||
"Uuml" => U(0xDC),
|
||||
"Xi" => U(0x039E),
|
||||
"Yacute" => U(0xDD),
|
||||
"Yuml" => U(0x0178),
|
||||
"Zeta" => U(0x0396),
|
||||
"aacute" => U(0xE1),
|
||||
"acirc" => U(0xE2),
|
||||
"acute" => U(0xB4),
|
||||
"aelig" => U(0xE6),
|
||||
"agrave" => U(0xE0),
|
||||
"alefsym" => U(0x2135),
|
||||
"alpha" => U(0x03B1),
|
||||
"amp" => U(0x26),
|
||||
"AMP" => U(0x26),
|
||||
"and" => U(0x2227),
|
||||
"ang" => U(0x2220),
|
||||
"apos" => U(0x27),
|
||||
"aring" => U(0xE5),
|
||||
"asymp" => U(0x2248),
|
||||
"atilde" => U(0xE3),
|
||||
"auml" => U(0xE4),
|
||||
"bdquo" => U(0x201E),
|
||||
"beta" => U(0x03B2),
|
||||
"brvbar" => U(0xA6),
|
||||
"bull" => U(0x2022),
|
||||
"cap" => U(0x2229),
|
||||
"ccedil" => U(0xE7),
|
||||
"cedil" => U(0xB8),
|
||||
"cent" => U(0xA2),
|
||||
"chi" => U(0x03C7),
|
||||
"circ" => U(0x02C6),
|
||||
"clubs" => U(0x2663),
|
||||
"cong" => U(0x2245),
|
||||
"copy" => U(0xA9),
|
||||
"COPY" => U(0xA9),
|
||||
"crarr" => U(0x21B5),
|
||||
"cup" => U(0x222A),
|
||||
"curren" => U(0xA4),
|
||||
"dArr" => U(0x21D3),
|
||||
"dagger" => U(0x2020),
|
||||
"darr" => U(0x2193),
|
||||
"deg" => U(0xB0),
|
||||
"delta" => U(0x03B4),
|
||||
"diams" => U(0x2666),
|
||||
"divide" => U(0xF7),
|
||||
"eacute" => U(0xE9),
|
||||
"ecirc" => U(0xEA),
|
||||
"egrave" => U(0xE8),
|
||||
"empty" => U(0x2205),
|
||||
"emsp" => U(0x2003),
|
||||
"ensp" => U(0x2002),
|
||||
"epsilon" => U(0x03B5),
|
||||
"equiv" => U(0x2261),
|
||||
"eta" => U(0x03B7),
|
||||
"eth" => U(0xF0),
|
||||
"euml" => U(0xEB),
|
||||
"euro" => U(0x20AC),
|
||||
"exist" => U(0x2203),
|
||||
"fnof" => U(0x0192),
|
||||
"forall" => U(0x2200),
|
||||
"frac12" => U(0xBD),
|
||||
"frac14" => U(0xBC),
|
||||
"frac34" => U(0xBE),
|
||||
"frasl" => U(0x2044),
|
||||
"gamma" => U(0x03B3),
|
||||
"ge" => U(0x2265),
|
||||
"gt" => U(0x3E),
|
||||
"GT" => U(0x3E),
|
||||
"hArr" => U(0x21D4),
|
||||
"harr" => U(0x2194),
|
||||
"hearts" => U(0x2665),
|
||||
"hellip" => U(0x2026),
|
||||
"iacute" => U(0xED),
|
||||
"icirc" => U(0xEE),
|
||||
"iexcl" => U(0xA1),
|
||||
"igrave" => U(0xEC),
|
||||
"image" => U(0x2111),
|
||||
"infin" => U(0x221E),
|
||||
"int" => U(0x222B),
|
||||
"iota" => U(0x03B9),
|
||||
"iquest" => U(0xBF),
|
||||
"isin" => U(0x2208),
|
||||
"iuml" => U(0xEF),
|
||||
"kappa" => U(0x03BA),
|
||||
"lArr" => U(0x21D0),
|
||||
"lambda" => U(0x03BB),
|
||||
"lang" => U(0x2329),
|
||||
"laquo" => U(0xAB),
|
||||
"larr" => U(0x2190),
|
||||
"lceil" => U(0x2308),
|
||||
"ldquo" => U(0x201C),
|
||||
"le" => U(0x2264),
|
||||
"lfloor" => U(0x230A),
|
||||
"lowast" => U(0x2217),
|
||||
"loz" => U(0x25CA),
|
||||
"lrm" => U(0x200E),
|
||||
"lsaquo" => U(0x2039),
|
||||
"lsquo" => U(0x2018),
|
||||
"lt" => U(0x3C),
|
||||
"LT" => U(0x3C),
|
||||
"macr" => U(0xAF),
|
||||
"mdash" => U(0x2014),
|
||||
"micro" => U(0xB5),
|
||||
"middot" => U(0xB7),
|
||||
"minus" => U(0x2212),
|
||||
"mu" => U(0x03BC),
|
||||
"nabla" => U(0x2207),
|
||||
"nbsp" => U(0xA0),
|
||||
"ndash" => U(0x2013),
|
||||
"ne" => U(0x2260),
|
||||
"ni" => U(0x220B),
|
||||
"not" => U(0xAC),
|
||||
"notin" => U(0x2209),
|
||||
"nsub" => U(0x2284),
|
||||
"ntilde" => U(0xF1),
|
||||
"nu" => U(0x03BD),
|
||||
"oacute" => U(0xF3),
|
||||
"ocirc" => U(0xF4),
|
||||
"oelig" => U(0x0153),
|
||||
"ograve" => U(0xF2),
|
||||
"oline" => U(0x203E),
|
||||
"omega" => U(0x03C9),
|
||||
"omicron" => U(0x03BF),
|
||||
"oplus" => U(0x2295),
|
||||
"or" => U(0x2228),
|
||||
"ordf" => U(0xAA),
|
||||
"ordm" => U(0xBA),
|
||||
"oslash" => U(0xF8),
|
||||
"otilde" => U(0xF5),
|
||||
"otimes" => U(0x2297),
|
||||
"ouml" => U(0xF6),
|
||||
"para" => U(0xB6),
|
||||
"part" => U(0x2202),
|
||||
"permil" => U(0x2030),
|
||||
"perp" => U(0x22A5),
|
||||
"phi" => U(0x03C6),
|
||||
"pi" => U(0x03C0),
|
||||
"piv" => U(0x03D6),
|
||||
"plusmn" => U(0xB1),
|
||||
"pound" => U(0xA3),
|
||||
"prime" => U(0x2032),
|
||||
"prod" => U(0x220F),
|
||||
"prop" => U(0x221D),
|
||||
"psi" => U(0x03C8),
|
||||
"quot" => U(0x22),
|
||||
"QUOT" => U(0x22),
|
||||
"rArr" => U(0x21D2),
|
||||
"radic" => U(0x221A),
|
||||
"rang" => U(0x232A),
|
||||
"raquo" => U(0xBB),
|
||||
"rarr" => U(0x2192),
|
||||
"rceil" => U(0x2309),
|
||||
"rdquo" => U(0x201D),
|
||||
"real" => U(0x211C),
|
||||
"reg" => U(0xAE),
|
||||
"REG" => U(0xAE),
|
||||
"rfloor" => U(0x230B),
|
||||
"rho" => U(0x03C1),
|
||||
"rlm" => U(0x200F),
|
||||
"rsaquo" => U(0x203A),
|
||||
"rsquo" => U(0x2019),
|
||||
"sbquo" => U(0x201A),
|
||||
"scaron" => U(0x0161),
|
||||
"sdot" => U(0x22C5),
|
||||
"sect" => U(0xA7),
|
||||
"shy" => U(0xAD),
|
||||
"sigma" => U(0x03C3),
|
||||
"sigmaf" => U(0x03C2),
|
||||
"sim" => U(0x223C),
|
||||
"spades" => U(0x2660),
|
||||
"sub" => U(0x2282),
|
||||
"sube" => U(0x2286),
|
||||
"sum" => U(0x2211),
|
||||
"sup" => U(0x2283),
|
||||
"sup1" => U(0xB9),
|
||||
"sup2" => U(0xB2),
|
||||
"sup3" => U(0xB3),
|
||||
"supe" => U(0x2287),
|
||||
"szlig" => U(0xDF),
|
||||
"tau" => U(0x03C4),
|
||||
"there4" => U(0x2234),
|
||||
"theta" => U(0x03B8),
|
||||
"thetasym" => U(0x03D1),
|
||||
"thinsp" => U(0x2009),
|
||||
"thorn" => U(0xFE),
|
||||
"tilde" => U(0x02DC),
|
||||
"times" => U(0xD7),
|
||||
"trade" => U(0x2122),
|
||||
"uArr" => U(0x21D1),
|
||||
"uacute" => U(0xFA),
|
||||
"uarr" => U(0x2191),
|
||||
"ucirc" => U(0xFB),
|
||||
"ugrave" => U(0xF9),
|
||||
"uml" => U(0xA8),
|
||||
"upsih" => U(0x03D2),
|
||||
"upsilon" => U(0x03C5),
|
||||
"uuml" => U(0xFC),
|
||||
"weierp" => U(0x2118),
|
||||
"xi" => U(0x03BE),
|
||||
"yacute" => U(0xFD),
|
||||
"yen" => U(0xA5),
|
||||
"yuml" => U(0xFF),
|
||||
"zeta" => U(0x03B6),
|
||||
"zwj" => U(0x200D),
|
||||
"zwnj" => U(0x200C)
|
||||
}
|
||||
|
||||
ENCODINGS = %w[
|
||||
ansi_x3.4-1968
|
||||
iso-ir-6
|
||||
ansi_x3.4-1986
|
||||
iso_646.irv:1991
|
||||
ascii
|
||||
iso646-us
|
||||
us-ascii
|
||||
us
|
||||
ibm367
|
||||
cp367
|
||||
csascii
|
||||
ks_c_5601-1987
|
||||
korean
|
||||
iso-2022-kr
|
||||
csiso2022kr
|
||||
euc-kr
|
||||
iso-2022-jp
|
||||
csiso2022jp
|
||||
iso-2022-jp-2
|
||||
iso-ir-58
|
||||
chinese
|
||||
csiso58gb231280
|
||||
iso_8859-1:1987
|
||||
iso-ir-100
|
||||
iso_8859-1
|
||||
iso-8859-1
|
||||
latin1
|
||||
l1
|
||||
ibm819
|
||||
cp819
|
||||
csisolatin1
|
||||
iso_8859-2:1987
|
||||
iso-ir-101
|
||||
iso_8859-2
|
||||
iso-8859-2
|
||||
latin2
|
||||
l2
|
||||
csisolatin2
|
||||
iso_8859-3:1988
|
||||
iso-ir-109
|
||||
iso_8859-3
|
||||
iso-8859-3
|
||||
latin3
|
||||
l3
|
||||
csisolatin3
|
||||
iso_8859-4:1988
|
||||
iso-ir-110
|
||||
iso_8859-4
|
||||
iso-8859-4
|
||||
latin4
|
||||
l4
|
||||
csisolatin4
|
||||
iso_8859-6:1987
|
||||
iso-ir-127
|
||||
iso_8859-6
|
||||
iso-8859-6
|
||||
ecma-114
|
||||
asmo-708
|
||||
arabic
|
||||
csisolatinarabic
|
||||
iso_8859-7:1987
|
||||
iso-ir-126
|
||||
iso_8859-7
|
||||
iso-8859-7
|
||||
elot_928
|
||||
ecma-118
|
||||
greek
|
||||
greek8
|
||||
csisolatingreek
|
||||
iso_8859-8:1988
|
||||
iso-ir-138
|
||||
iso_8859-8
|
||||
iso-8859-8
|
||||
hebrew
|
||||
csisolatinhebrew
|
||||
iso_8859-5:1988
|
||||
iso-ir-144
|
||||
iso_8859-5
|
||||
iso-8859-5
|
||||
cyrillic
|
||||
csisolatincyrillic
|
||||
iso_8859-9:1989
|
||||
iso-ir-148
|
||||
iso_8859-9
|
||||
iso-8859-9
|
||||
latin5
|
||||
l5
|
||||
csisolatin5
|
||||
iso-8859-10
|
||||
iso-ir-157
|
||||
l6
|
||||
iso_8859-10:1992
|
||||
csisolatin6
|
||||
latin6
|
||||
hp-roman8
|
||||
roman8
|
||||
r8
|
||||
ibm037
|
||||
cp037
|
||||
csibm037
|
||||
ibm424
|
||||
cp424
|
||||
csibm424
|
||||
ibm437
|
||||
cp437
|
||||
437
|
||||
cspc8codepage437
|
||||
ibm500
|
||||
cp500
|
||||
csibm500
|
||||
ibm775
|
||||
cp775
|
||||
cspc775baltic
|
||||
ibm850
|
||||
cp850
|
||||
850
|
||||
cspc850multilingual
|
||||
ibm852
|
||||
cp852
|
||||
852
|
||||
cspcp852
|
||||
ibm855
|
||||
cp855
|
||||
855
|
||||
csibm855
|
||||
ibm857
|
||||
cp857
|
||||
857
|
||||
csibm857
|
||||
ibm860
|
||||
cp860
|
||||
860
|
||||
csibm860
|
||||
ibm861
|
||||
cp861
|
||||
861
|
||||
cp-is
|
||||
csibm861
|
||||
ibm862
|
||||
cp862
|
||||
862
|
||||
cspc862latinhebrew
|
||||
ibm863
|
||||
cp863
|
||||
863
|
||||
csibm863
|
||||
ibm864
|
||||
cp864
|
||||
csibm864
|
||||
ibm865
|
||||
cp865
|
||||
865
|
||||
csibm865
|
||||
ibm866
|
||||
cp866
|
||||
866
|
||||
csibm866
|
||||
ibm869
|
||||
cp869
|
||||
869
|
||||
cp-gr
|
||||
csibm869
|
||||
ibm1026
|
||||
cp1026
|
||||
csibm1026
|
||||
koi8-r
|
||||
cskoi8r
|
||||
koi8-u
|
||||
big5-hkscs
|
||||
ptcp154
|
||||
csptcp154
|
||||
pt154
|
||||
cp154
|
||||
utf-7
|
||||
utf-16be
|
||||
utf-16le
|
||||
utf-16
|
||||
utf-8
|
||||
iso-8859-13
|
||||
iso-8859-14
|
||||
iso-ir-199
|
||||
iso_8859-14:1998
|
||||
iso_8859-14
|
||||
latin8
|
||||
iso-celtic
|
||||
l8
|
||||
iso-8859-15
|
||||
iso_8859-15
|
||||
iso-8859-16
|
||||
iso-ir-226
|
||||
iso_8859-16:2001
|
||||
iso_8859-16
|
||||
latin10
|
||||
l10
|
||||
gbk
|
||||
cp936
|
||||
ms936
|
||||
gb18030
|
||||
shift_jis
|
||||
ms_kanji
|
||||
csshiftjis
|
||||
euc-jp
|
||||
gb2312
|
||||
big5
|
||||
csbig5
|
||||
windows-1250
|
||||
windows-1251
|
||||
windows-1252
|
||||
windows-1253
|
||||
windows-1254
|
||||
windows-1255
|
||||
windows-1256
|
||||
windows-1257
|
||||
windows-1258
|
||||
tis-620
|
||||
hz-gb-2312
|
||||
]
|
||||
|
||||
end
|
|
@ -1 +0,0 @@
|
|||
require 'html5lib/filters/optionaltags'
|
|
@ -1,2 +0,0 @@
|
|||
require 'html5lib/serializer/htmlserializer'
|
||||
require 'html5lib/serializer/xhtmlserializer'
|
Loading…
Add table
Add a link
Reference in a new issue