diff --git a/vendor/plugins/HTML5lib/bin/html5 b/vendor/plugins/HTML5lib/bin/html5 index 2680aea3..bc0514ad 100755 --- a/vendor/plugins/HTML5lib/bin/html5 +++ b/vendor/plugins/HTML5lib/bin/html5 @@ -81,8 +81,8 @@ def print_output(parser, document, opts) if opts.error errList=[] - for pos, message in parser.errors - errList << ("Line %i Col %i"%pos + " " + message) + for pos, errorcode, datavars in parser.errors + errList << "Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars end $stdout.write("\nParse errors:\n" + errList.join("\n")+"\n") end diff --git a/vendor/plugins/HTML5lib/lib/html5/constants.rb b/vendor/plugins/HTML5lib/lib/html5/constants.rb index 8ccaf66d..9a4580fa 100755 --- a/vendor/plugins/HTML5lib/lib/html5/constants.rb +++ b/vendor/plugins/HTML5lib/lib/html5/constants.rb @@ -2,6 +2,8 @@ module HTML5 class EOF < Exception; end + def self._(str); str end + CONTENT_MODEL_FLAGS = [ :PCDATA, :RCDATA, @@ -815,4 +817,228 @@ module HTML5 hz-gb-2312 ] + E = { + "null-character" => + _("Null character in input stream, replaced with U+FFFD."), + "incorrectly-placed-solidus" => + _("Solidus (/) incorrectly placed in tag."), + "incorrect-cr-newline-entity" => + _("Incorrect CR newline entity, replaced with LF."), + "illegal-windows-1252-entity" => + _("Entity used with illegal number (windows-1252 reference)."), + "cant-convert-numeric-entity" => + _("Numeric entity couldn't be converted to character " + + "(codepoint U+%(charAsInt)08x)."), + "illegal-codepoint-for-numeric-entity" => + _("Numeric entity represents an illegal codepoint=> " + + "U+%(charAsInt)08x."), + "numeric-entity-without-semicolon" => + _("Numeric entity didn't end with ';'."), + "expected-numeric-entity-but-got-eof" => + _("Numeric entity expected. 
Got end of file instead."), + "expected-numeric-entity" => + _("Numeric entity expected but none found."), + "named-entity-without-semicolon" => + _("Named entity didn't end with ';'."), + "expected-named-entity" => + _("Named entity expected. Got none."), + "attributes-in-end-tag" => + _("End tag contains unexpected attributes."), + "expected-tag-name-but-got-right-bracket" => + _("Expected tag name. Got '>' instead."), + "expected-tag-name-but-got-question-mark" => + _("Expected tag name. Got '?' instead. (HTML doesn't " + + "support processing instructions.)"), + "expected-tag-name" => + _("Expected tag name. Got something else instead"), + "expected-closing-tag-but-got-right-bracket" => + _("Expected closing tag. Got '>' instead. Ignoring ''."), + "expected-closing-tag-but-got-eof" => + _("Expected closing tag. Unexpected end of file."), + "expected-closing-tag-but-got-char" => + _("Expected closing tag. Unexpected character '%(data)' found."), + "eof-in-tag-name" => + _("Unexpected end of file in the tag name."), + "expected-attribute-name-but-got-eof" => + _("Unexpected end of file. Expected attribute name instead."), + "eof-in-attribute-name" => + _("Unexpected end of file in attribute name."), + "duplicate-attribute" => + _("Dropped duplicate attribute on tag."), + "expected-end-of-tag-name-but-got-eof" => + _("Unexpected end of file. Expected = or end of tag."), + "expected-attribute-value-but-got-eof" => + _("Unexpected end of file. Expected attribute value."), + "eof-in-attribute-value-double-quote" => + _("Unexpected end of file in attribute value (\")."), + "eof-in-attribute-value-single-quote" => + _("Unexpected end of file in attribute value (')."), + "eof-in-attribute-value-no-quotes" => + _("Unexpected end of file in attribute value."), + "expected-dashes-or-doctype" => + _("Expected '--' or 'DOCTYPE'. 
Not found."), + "incorrect-comment" => + _("Incorrect comment."), + "eof-in-comment" => + _("Unexpected end of file in comment."), + "eof-in-comment-end-dash" => + _("Unexpected end of file in comment (-)"), + "unexpected-dash-after-double-dash-in-comment" => + _("Unexpected '-' after '--' found in comment."), + "eof-in-comment-double-dash" => + _("Unexpected end of file in comment (--)."), + "unexpected-char-in-comment" => + _("Unexpected character in comment found."), + "need-space-after-doctype" => + _("No space after literal string 'DOCTYPE'."), + "expected-doctype-name-but-got-right-bracket" => + _("Unexpected > character. Expected DOCTYPE name."), + "expected-doctype-name-but-got-eof" => + _("Unexpected end of file. Expected DOCTYPE name."), + "eof-in-doctype-name" => + _("Unexpected end of file in DOCTYPE name."), + "eof-in-doctype" => + _("Unexpected end of file in DOCTYPE."), + "expected-space-or-right-bracket-in-doctype" => + _("Expected space or '>'. Got '%(data)'"), + "unexpected-end-of-doctype" => + _("Unexpected end of DOCTYPE."), + "unexpected-char-in-doctype" => + _("Unexpected character in DOCTYPE."), + "eof-in-bogus-doctype" => + _("Unexpected end of file in bogus doctype."), + "eof-in-innerhtml" => + _("XXX innerHTML EOF"), + "unexpected-doctype" => + _("Unexpected DOCTYPE. Ignored."), + "non-html-root" => + _("html needs to be the first start tag."), + "expected-doctype-but-got-eof" => + _("Unexpected End of file. Expected DOCTYPE."), + "unknown-doctype" => + _("Erroneous DOCTYPE."), + "expected-doctype-but-got-chars" => + _("Unexpected non-space characters. Expected DOCTYPE."), + "expected-doctype-but-got-start-tag" => + _("Unexpected start tag (%(name)). Expected DOCTYPE."), + "expected-doctype-but-got-end-tag" => + _("Unexpected end tag (%(name)). 
Expected DOCTYPE."), + "end-tag-after-implied-root" => + _("Unexpected end tag (%(name)) after the (implied) root element."), + "expected-named-closing-tag-but-got-eof" => + _("Unexpected end of file. Expected end tag (%(name))."), + "two-heads-are-not-better-than-one" => + _("Unexpected start tag head in existing head. Ignored."), + "unexpected-end-tag" => + _("Unexpected end tag (%(name)). Ignored."), + "unexpected-start-tag-out-of-my-head" => + _("Unexpected start tag (%(name)) that can be in head. Moved."), + "unexpected-start-tag" => + _("Unexpected start tag (%(name))."), + "missing-end-tag" => + _("Missing end tag (%(name))."), + "missing-end-tags" => + _("Missing end tags (%(name))."), + "unexpected-start-tag-implies-end-tag" => + _("Unexpected start tag (%(startName)) " + + "implies end tag (%(endName))."), + "unexpected-start-tag-treated-as" => + _("Unexpected start tag (%(originalName)). Treated as %(newName)."), + "deprecated-tag" => + _("Unexpected start tag %(name). Don't use it!"), + "unexpected-start-tag-ignored" => + _("Unexpected start tag %(name). Ignored."), + "expected-one-end-tag-but-got-another" => + _("Unexpected end tag (%(gotName)). " + + "Missing end tag (%(expectedName))."), + "end-tag-too-early" => + _("End tag (%(name)) seen too early. Expected other end tag."), + "end-tag-too-early-named" => + _("Unexpected end tag (%(gotName)). Expected end tag (%(expectedName))."), + "end-tag-too-early-ignored" => + _("End tag (%(name)) seen too early. Ignored."), + "adoption-agency-1.1" => + _("End tag (%(name)) violates step 1, " + + "paragraph 1 of the adoption agency algorithm."), + "adoption-agency-1.2" => + _("End tag (%(name)) violates step 1, " + + "paragraph 2 of the adoption agency algorithm."), + "adoption-agency-1.3" => + _("End tag (%(name)) violates step 1, " + + "paragraph 3 of the adoption agency algorithm."), + "unexpected-end-tag-treated-as" => + _("Unexpected end tag (%(originalName)). 
Treated as %(newName)."), + "no-end-tag" => + _("This element (%(name)) has no end tag."), + "unexpected-implied-end-tag-in-table" => + _("Unexpected implied end tag (%(name)) in the table phase."), + "unexpected-implied-end-tag-in-table-body" => + _("Unexpected implied end tag (%(name)) in the table body phase."), + "unexpected-char-implies-table-voodoo" => + _("Unexpected non-space characters in " + + "table context caused voodoo mode."), + "unexpected-start-tag-implies-table-voodoo" => + _("Unexpected start tag (%(name)) in " + + "table context caused voodoo mode."), + "unexpected-end-tag-implies-table-voodoo" => + _("Unexpected end tag (%(name)) in " + + "table context caused voodoo mode."), + "unexpected-cell-in-table-body" => + _("Unexpected table cell start tag (%(name)) " + + "in the table body phase."), + "unexpected-cell-end-tag" => + _("Got table cell end tag (%(name)) " + + "while required end tags are missing."), + "unexpected-end-tag-in-table-body" => + _("Unexpected end tag (%(name)) in the table body phase. Ignored."), + "unexpected-implied-end-tag-in-table-row" => + _("Unexpected implied end tag (%(name)) in the table row phase."), + "unexpected-end-tag-in-table-row" => + _("Unexpected end tag (%(name)) in the table row phase. Ignored."), + "unexpected-select-in-select" => + _("Unexpected select start tag in the select phase " + + "implies select start tag."), + "unexpected-start-tag-in-select" => + _("Unexpected start tag token (%(name) in the select phase. " + + "Ignored."), + "unexpected-end-tag-in-select" => + _("Unexpected end tag (%(name)) in the select phase. 
Ignored."), + "unexpected-char-after-body" => + _("Unexpected non-space characters in the after body phase."), + "unexpected-start-tag-after-body" => + _("Unexpected start tag token (%(name))" + + " in the after body phase."), + "unexpected-end-tag-after-body" => + _("Unexpected end tag token (%(name))" + + " in the after body phase."), + "unexpected-char-in-frameset" => + _("Unepxected characters in the frameset phase. Characters ignored."), + "unexpected-start-tag-in-frameset" => + _("Unexpected start tag token (%(name))" + + " in the frameset phase. Ignored."), + "unexpected-frameset-in-frameset-innerhtml" => + _("Unexpected end tag token (frameset) " + + "in the frameset phase (innerHTML)."), + "unexpected-end-tag-in-frameset" => + _("Unexpected end tag token (%(name))" + + " in the frameset phase. Ignored."), + "unexpected-char-after-frameset" => + _("Unexpected non-space characters in the " + + "after frameset phase. Ignored."), + "unexpected-start-tag-after-frameset" => + _("Unexpected start tag (%(name))" + + " in the after frameset phase. Ignored."), + "unexpected-end-tag-after-frameset" => + _("Unexpected end tag (%(name))" + + " in the after frameset phase. Ignored."), + "expected-eof-but-got-char" => + _("Unexpected non-space characters. Expected end of file."), + "expected-eof-but-got-start-tag" => + _("Unexpected start tag (%(name))" + + ". Expected end of file."), + "expected-eof-but-got-end-tag" => + _("Unexpected end tag (%(name))" + + ". 
Expected end of file."), + } + end diff --git a/vendor/plugins/HTML5lib/lib/html5/filters/iso639codes.rb b/vendor/plugins/HTML5lib/lib/html5/filters/iso639codes.rb new file mode 100755 index 00000000..ce3c9623 --- /dev/null +++ b/vendor/plugins/HTML5lib/lib/html5/filters/iso639codes.rb @@ -0,0 +1,752 @@ +# borrowed from feedvalidator, original copyright license is +# +# Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
# Maps ISO 639 language codes (two-letter, then three-letter) to an English
# language name. Keys are lower-case Strings.
#
# Several three-letter codes are listed more than once with alternative names;
# in a Hash literal the last value given for a key is the one that is kept.
ISO_LANG = {
  'aa' => 'Afar',
  'ab' => 'Abkhazian',
  'ae' => 'Avestan',
  'af' => 'Afrikaans',
  'ak' => 'Akan',
  'am' => 'Amharic',
  'an' => 'Aragonese',
  'ar' => 'Arabic',
  'as' => 'Assamese',
  'av' => 'Avaric',
  'ay' => 'Aymara',
  'az' => 'Azerbaijani',
  'ba' => 'Bashkir',
  'be' => 'Byelorussian',
  'bg' => 'Bulgarian',
  'bh' => 'Bihari',
  'bi' => 'Bislama',
  'bm' => 'Bambara',
  'bn' => 'Bengali;Bangla',
  'bo' => 'Tibetan',
  'br' => 'Breton',
  'bs' => 'Bosnian',
  'ca' => 'Catalan',
  'ce' => 'Chechen',
  'ch' => 'Chamorro',
  'co' => 'Corsican',
  'cr' => 'Cree',
  'cs' => 'Czech',
  'cu' => 'Church Slavic',
  'cv' => 'Chuvash',
  'cy' => 'Welsh',
  'da' => 'Danish',
  'de' => 'German',
  'dv' => 'Divehi',
  'dz' => 'Dzongkha',
  'ee' => 'Ewe',
  'el' => 'Greek',
  'en' => 'English',
  'eo' => 'Esperanto',
  'es' => 'Spanish',
  'et' => 'Estonian',
  'eu' => 'Basque',
  'fa' => 'Persian (Farsi)',
  'ff' => 'Fulah',
  'fi' => 'Finnish',
  'fj' => 'Fiji',
  'fo' => 'Faroese',
  'fr' => 'French',
  'fy' => 'Frisian, Western',
  'ga' => 'Irish',
  'gd' => 'Scots Gaelic',
  'gl' => 'Galician',
  'gn' => 'Guarani',
  'gu' => 'Gujarati',
  'gv' => 'Manx',
  'ha' => 'Hausa',
  'he' => 'Hebrew',
  'hi' => 'Hindi',
  'ho' => 'Hiri Motu',
  'hr' => 'Croatian',
  'ht' => 'Haitian',
  'hu' => 'Hungarian',
  'hy' => 'Armenian',
  'hz' => 'Herero',
  'ia' => 'Interlingua',
  'id' => 'Indonesian',
  'ie' => 'Interlingue',
  'ig' => 'Igbo',
  'ii' => 'Sichuan Yi',
  'ik' => 'Inupiak',
  'io' => 'Ido',
  'is' => 'Icelandic',
  'it' => 'Italian',
  'iu' => 'Inuktitut',
  'ja' => 'Japanese',
  'jv' => 'Javanese',
  'ka' => 'Georgian',
  'kg' => 'Kongo',
  'ki' => 'Kikuyu; Gikuyu',
  'kj' => 'Kuanyama; Kwanyama',
  'kk' => 'Kazakh',
  'kl' => 'Greenlandic',
  'km' => 'Cambodian',
  'kn' => 'Kannada',
  'ko' => 'Korean',
  'kr' => 'Kanuri',
  'ks' => 'Kashmiri',
  'ku' => 'Kurdish',
  'kv' => 'Komi',
  'kw' => 'Cornish',
  'ky' => 'Kirghiz',
  'la' => 'Latin',
  'lb' => 'Letzeburgesch; Luxembourgish',
  'lg' => 'Ganda',
  'li' => 'Limburgan; Limburger, Limburgish',
  'ln' => 'Lingala',
  'lo' => 'Lao',
  'lt' => 'Lithuanian',
  'lu' => 'Luba-Katanga',
  'lv' => 'Latvian',
  'mg' => 'Malagasy',
  'mh' => 'Marshallese',
  'mi' => 'Maori',
  'mk' => 'Macedonian',
  'ml' => 'Malayalam',
  'mn' => 'Mongolian',
  'mo' => 'Moldavian',
  'mr' => 'Marathi',
  'ms' => 'Malay',
  'mt' => 'Maltese',
  'my' => 'Burmese',
  'na' => 'Nauru',
  'nb' => 'Norwegian Bokmal',
  'nd' => 'Ndebele, North',
  'ne' => 'Nepali',
  'ng' => 'Ndonga',
  'nl' => 'Dutch',
  'nn' => 'Norwegian Nynorsk',
  'no' => 'Norwegian',
  'nr' => 'Ndebele, South',
  'nv' => 'Navaho; Navajo',
  'ny' => 'Chewa; Chichewa; Nyanha',
  'oc' => 'Occitan',
  'oj' => 'Ojibwa',
  'om' => 'Afan (Oromo)',
  'or' => 'Oriya',
  'os' => 'Ossetian; Ossetic',
  'pa' => 'Punjabi',
  'pi' => 'Pali',
  'pl' => 'Polish',
  'ps' => 'Pushto',
  'pt' => 'Portuguese',
  'qu' => 'Quechua',
  'rm' => 'Rhaeto-Romance',
  'rn' => 'Kurundi',
  'ro' => 'Romanian',
  'ru' => 'Russian',
  'rw' => 'Kinyarwanda',
  'sa' => 'Sanskrit',
  'sc' => 'Sardinian',
  'sd' => 'Sindhi',
  'se' => 'Northern Sami',
  'sg' => 'Sangho',
  'sh' => 'Serbo-Croatian',
  'si' => 'Singhalese',
  'sk' => 'Slovak',
  'sl' => 'Slovenian',
  'sm' => 'Samoan',
  'sn' => 'Shona',
  'so' => 'Somali',
  'sq' => 'Albanian',
  'sr' => 'Serbian',
  'ss' => 'Swati',
  'st' => 'Sotho, Southern',
  'su' => 'Sundanese',
  'sv' => 'Swedish',
  'sw' => 'Swahili',
  'ta' => 'Tamil',
  'te' => 'Telugu',
  'tg' => 'Tajik',
  'th' => 'Thai',
  'ti' => 'Tigrinya',
  'tk' => 'Turkmen',
  'tl' => 'Tagalog',
  'tn' => 'Tswana',
  'to' => 'Tonga',
  'tr' => 'Turkish',
  'ts' => 'Tsonga',
  'tt' => 'Tatar',
  'tw' => 'Twi',
  'ty' => 'Tahitian',
  'ug' => 'Uigur',
  'uk' => 'Ukrainian',
  'ur' => 'Urdu',
  'uz' => 'Uzbek',
  've' => 'Venda',
  'vi' => 'Vietnamese',
  'vo' => 'Volapuk',
  'wa' => 'Walloon',
  'wo' => 'Wolof',
  'xh' => 'Xhosa',
  'yi' => 'Yiddish',
  'yo' => 'Yoruba',
  'za' => 'Zhuang',
  'zh' => 'Chinese',
  'zu' => 'Zulu',
  'x' => 'a user-defined language',
  'xx' => 'a user-defined language',

  'abk' => 'Abkhazian',
  'ace' => 'Achinese',
  'ach' => 'Acoli',
  'ada' => 'Adangme',
  'ady' => 'Adygei',
  'ady' => 'Adyghe',
  'aar' => 'Afar',
  'afh' => 'Afrihili',
  'afr' => 'Afrikaans',
  'afa' => 'Afro-Asiatic (Other)',
  'ain' => 'Ainu',
  'aka' => 'Akan',
  'akk' => 'Akkadian',
  'alb' => 'Albanian',
  'sqi' => 'Albanian',
  'gws' => 'Alemanic',
  'ale' => 'Aleut',
  'alg' => 'Algonquian languages',
  'tut' => 'Altaic (Other)',
  'amh' => 'Amharic',
  'anp' => 'Angika',
  'apa' => 'Apache languages',
  'ara' => 'Arabic',
  'arg' => 'Aragonese',
  'arc' => 'Aramaic',
  'arp' => 'Arapaho',
  'arn' => 'Araucanian',
  'arw' => 'Arawak',
  'arm' => 'Armenian',
  'hye' => 'Armenian',
  'rup' => 'Aromanian',
  'art' => 'Artificial (Other)',
  'asm' => 'Assamese',
  'ast' => 'Asturian',
  'ath' => 'Athapascan languages',
  'aus' => 'Australian languages',
  'map' => 'Austronesian (Other)',
  'ava' => 'Avaric',
  'ave' => 'Avestan',
  'awa' => 'Awadhi',
  'aym' => 'Aymara',
  'aze' => 'Azerbaijani',
  'ast' => 'Bable',
  'ban' => 'Balinese',
  'bat' => 'Baltic (Other)',
  'bal' => 'Baluchi',
  'bam' => 'Bambara',
  'bai' => 'Bamileke languages',
  'bad' => 'Banda',
  'bnt' => 'Bantu (Other)',
  'bas' => 'Basa',
  'bak' => 'Bashkir',
  'baq' => 'Basque',
  'eus' => 'Basque',
  'btk' => 'Batak (Indonesia)',
  'bej' => 'Beja',
  'bel' => 'Belarusian',
  'bem' => 'Bemba',
  'ben' => 'Bengali',
  'ber' => 'Berber (Other)',
  'bho' => 'Bhojpuri',
  'bih' => 'Bihari',
  'bik' => 'Bikol',
  'byn' => 'Bilin',
  'bin' => 'Bini',
  'bis' => 'Bislama',
  'byn' => 'Blin',
  'nob' => 'Bokmal, Norwegian',
  'bos' => 'Bosnian',
  'bra' => 'Braj',
  'bre' => 'Breton',
  'bug' => 'Buginese',
  'bul' => 'Bulgarian',
  'bua' => 'Buriat',
  'bur' => 'Burmese',
  'mya' => 'Burmese',
  'cad' => 'Caddo',
  'car' => 'Carib',
  'spa' => 'Castilian',
  'cat' => 'Catalan',
  'cau' => 'Caucasian (Other)',
  'ceb' => 'Cebuano',
  'cel' => 'Celtic (Other)',
  'cai' => 'Central American Indian (Other)',
  'chg' => 'Chagatai',
  'cmc' => 'Chamic languages',
  'cha' => 'Chamorro',
  'che' => 'Chechen',
  'chr' => 'Cherokee',
  'nya' => 'Chewa',
  'chy' => 'Cheyenne',
  'chb' => 'Chibcha',
  'nya' => 'Chichewa',
  'chi' => 'Chinese',
  'zho' => 'Chinese',
  'chn' => 'Chinook jargon',
  'chp' => 'Chipewyan',
  'cho' => 'Choctaw',
  'zha' => 'Chuang',
  'chu' => 'Church Slavic; Church Slavonic; Old Church Slavonic; Old Church Slavic; Old Bulgarian',
  'chk' => 'Chuukese',
  'chv' => 'Chuvash',
  'nwc' => 'Classical Nepal Bhasa; Classical Newari; Old Newari',
  'cop' => 'Coptic',
  'cor' => 'Cornish',
  'cos' => 'Corsican',
  'cre' => 'Cree',
  'mus' => 'Creek',
  'crp' => 'Creoles and pidgins(Other)',
  'cpe' => 'Creoles and pidgins, English-based (Other)',
  'cpf' => 'Creoles and pidgins, French-based (Other)',
  'cpp' => 'Creoles and pidgins, Portuguese-based (Other)',
  'crh' => 'Crimean Tatar; Crimean Turkish',
  'scr' => 'Croatian',
  'hrv' => 'Croatian',
  'cus' => 'Cushitic (Other)',
  'cze' => 'Czech',
  'ces' => 'Czech',
  'dak' => 'Dakota',
  'dan' => 'Danish',
  'dar' => 'Dargwa',
  'day' => 'Dayak',
  'del' => 'Delaware',
  'din' => 'Dinka',
  'div' => 'Divehi',
  'doi' => 'Dogri',
  'dgr' => 'Dogrib',
  'dra' => 'Dravidian (Other)',
  'dua' => 'Duala',
  'dut' => 'Dutch',
  'nld' => 'Dutch',
  'dum' => 'Dutch, Middle (ca. 1050-1350)',
  'dyu' => 'Dyula',
  'dzo' => 'Dzongkha',
  'efi' => 'Efik',
  'egy' => 'Egyptian (Ancient)',
  'eka' => 'Ekajuk',
  'elx' => 'Elamite',
  'eng' => 'English',
  'enm' => 'English, Middle (1100-1500)',
  'ang' => 'English, Old (ca.450-1100)',
  'myv' => 'Erzya',
  'epo' => 'Esperanto',
  'est' => 'Estonian',
  'ewe' => 'Ewe',
  'ewo' => 'Ewondo',
  'fan' => 'Fang',
  'fat' => 'Fanti',
  'fao' => 'Faroese',
  'fij' => 'Fijian',
  'fil' => 'Filipino; Pilipino',
  'fin' => 'Finnish',
  'fiu' => 'Finno-Ugrian (Other)',
  'fon' => 'Fon',
  'fre' => 'French',
  'fra' => 'French',
  'frm' => 'French, Middle (ca.1400-1600)',
  'fro' => 'French, Old (842-ca.1400)',
  'frs' => 'Frisian, Eastern',
  'fry' => 'Frisian, Western',
  'fur' => 'Friulian',
  'ful' => 'Fulah',
  'gaa' => 'Ga',
  'gla' => 'Gaelic',
  'glg' => 'Gallegan',
  'lug' => 'Ganda',
  'gay' => 'Gayo',
  'gba' => 'Gbaya',
  'gez' => 'Geez',
  'geo' => 'Georgian',
  'kat' => 'Georgian',
  'ger' => 'German',
  'deu' => 'German',
  'nds' => 'German, Low',
  'gmh' => 'German, Middle High (ca.1050-1500)',
  'goh' => 'German, Old High (ca.750-1050)',
  'gem' => 'Germanic (Other)',
  'kik' => 'Gikuyu',
  'gil' => 'Gilbertese',
  'gon' => 'Gondi',
  'gor' => 'Gorontalo',
  'got' => 'Gothic',
  'grb' => 'Grebo',
  'grc' => 'Greek, Ancient (to 1453)',
  'gre' => 'Greek, Modern (1453-)',
  'ell' => 'Greek, Modern (1453-)',
  'kal' => 'Greenlandic; Kalaallisut',
  'grn' => 'Guarani',
  'guj' => 'Gujarati',
  'gwi' => 'Gwich\'in',
  'hai' => 'Haida',
  'hat' => 'Haitian',
  'hau' => 'Hausa',
  'haw' => 'Hawaiian',
  'heb' => 'Hebrew',
  'her' => 'Herero',
  'hil' => 'Hiligaynon',
  'him' => 'Himachali',
  'hin' => 'Hindi',
  'hmo' => 'Hiri Motu',
  'hit' => 'Hittite',
  'hmn' => 'Hmong',
  'hun' => 'Hungarian',
  'hup' => 'Hupa',
  'iba' => 'Iban',
  'ice' => 'Icelandic',
  'isl' => 'Icelandic',
  'ido' => 'Ido',
  'ibo' => 'Igbo',
  'ijo' => 'Ijo',
  'ilo' => 'Iloko',
  'smn' => 'Inari Sami',
  'inc' => 'Indic (Other)',
  'ine' => 'Indo-European (Other)',
  'ind' => 'Indonesian',
  'inh' => 'Ingush',
  'ina' => 'Interlingua (International Auxiliary Language Association)',
  'ile' => 'Interlingue',
  'iku' => 'Inuktitut',
  'ipk' => 'Inupiaq',
  'ira' => 'Iranian (Other)',
  'gle' => 'Irish',
  'mga' => 'Irish, Middle (900-1200)',
  'sga' => 'Irish, Old (to 900)',
  'iro' => 'Iroquoian languages',
  'ita' => 'Italian',
  'jpn' => 'Japanese',
  'jav' => 'Javanese',
  'jrb' => 'Judeo-Arabic',
  'jpr' => 'Judeo-Persian',
  'kbd' => 'Kabardian',
  'kab' => 'Kabyle',
  'kac' => 'Kachin',
  'kal' => 'Kalaallisut',
  'xal' => 'Kalmyk',
  'kam' => 'Kamba',
  'kan' => 'Kannada',
  'kau' => 'Kanuri',
  'krc' => 'Karachay-Balkar',
  'kaa' => 'Kara-Kalpak',
  'krl' => 'Karelian',
  'kar' => 'Karen',
  'kas' => 'Kashmiri',
  'csb' => 'Kashubian',
  'kaw' => 'Kawi',
  'kaz' => 'Kazakh',
  'kha' => 'Khasi',
  'khm' => 'Khmer',
  'khi' => 'Khoisan (Other)',
  'kho' => 'Khotanese',
  'kik' => 'Kikuyu',
  'kmb' => 'Kimbundu',
  'kin' => 'Kinyarwanda',
  'kir' => 'Kirghiz',
  'tlh' => 'Klingon; tlhIngan-Hol',
  'kom' => 'Komi',
  'kon' => 'Kongo',
  'kok' => 'Konkani',
  'kor' => 'Korean',
  'kos' => 'Kosraean',
  'kpe' => 'Kpelle',
  'kro' => 'Kru',
  'kua' => 'Kuanyama',
  'kum' => 'Kumyk',
  'kur' => 'Kurdish',
  'kru' => 'Kurukh',
  'kut' => 'Kutenai',
  'kua' => 'Kwanyama',
  'lad' => 'Ladino',
  'lah' => 'Lahnda',
  'lam' => 'Lamba',
  'lao' => 'Lao',
  'lat' => 'Latin',
  'lav' => 'Latvian',
  'ltz' => 'Letzeburgesch',
  'lez' => 'Lezghian',
  'lim' => 'Limburgan',
  'lin' => 'Lingala',
  'lit' => 'Lithuanian',
  'jbo' => 'Lojban',
  'nds' => 'Low German',
  'dsb' => 'Lower Sorbian',
  'loz' => 'Lozi',
  'lub' => 'Luba-Katanga',
  'lua' => 'Luba-Lulua',
  'lui' => 'Luiseno',
  'smj' => 'Lule Sami',
  'lun' => 'Lunda',
  'luo' => 'Luo (Kenya and Tanzania)',
  'lus' => 'Lushai',
  'ltz' => 'Luxembourgish',
  'mac' => 'Macedonian',
  'mkd' => 'Macedonian',
  'mad' => 'Madurese',
  'mag' => 'Magahi',
  'mai' => 'Maithili',
  'mak' => 'Makasar',
  'mlg' => 'Malagasy',
  'may' => 'Malay',
  'msa' => 'Malay',
  'mal' => 'Malayalam',
  'mlt' => 'Maltese',
  'mnc' => 'Manchu',
  'mdr' => 'Mandar',
  'man' => 'Mandingo',
  'mni' => 'Manipuri',
  'mno' => 'Manobo languages',
  'glv' => 'Manx',
  'mao' => 'Maori',
  'mri' => 'Maori',
  'mar' => 'Marathi',
  'chm' => 'Mari',
  'mah' => 'Marshallese',
  'mwr' => 'Marwari',
  'mas' => 'Masai',
  'myn' => 'Mayan languages',
  'men' => 'Mende',
  'mic' => 'Micmac',
  'min' => 'Minangkabau',
  'mwl' => 'Mirandese',
  'mis' => 'Miscellaneous languages',
  'moh' => 'Mohawk',
  'mdf' => 'Moksha',
  'mol' => 'Moldavian',
  'mkh' => 'Mon-Khmer (Other)',
  'lol' => 'Mongo',
  'mon' => 'Mongolian',
  'mos' => 'Mossi',
  'mul' => 'Multiple languages',
  'mun' => 'Munda languages',
  'nah' => 'Nahuatl',
  'nau' => 'Nauru',
  'nav' => 'Navaho; Navajo',
  'nde' => 'Ndebele, North',
  'nbl' => 'Ndebele, South',
  'ndo' => 'Ndonga',
  'nap' => 'Neapolitan',
  'nep' => 'Nepali',
  'new' => 'Newari',
  'nia' => 'Nias',
  'nic' => 'Niger-Kordofanian (Other)',
  'ssa' => 'Nilo-Saharan (Other)',
  'niu' => 'Niuean',
  'nog' => 'Nogai',
  'non' => 'Norse, Old',
  'nai' => 'North American Indian (Other)',
  'frr' => 'Northern Frisian',
  'sme' => 'Northern Sami',
  'nso' => 'Northern Sotho; Pedi; Sepedi',
  'nde' => 'North Ndebele',
  'nor' => 'Norwegian',
  'nob' => 'Norwegian Bokmal',
  'nno' => 'Norwegian Nynorsk',
  'nub' => 'Nubian languages',
  'nym' => 'Nyamwezi',
  'nya' => 'Nyanja',
  'nyn' => 'Nyankole',
  'nno' => 'Nynorsk, Norwegian',
  'nyo' => 'Nyoro',
  'nzi' => 'Nzima',
  'oci' => 'Occitan (post 1500)',
  'oji' => 'Ojibwa',
  'ori' => 'Oriya',
  'orm' => 'Oromo',
  'osa' => 'Osage',
  'oss' => 'Ossetian; Ossetic',
  'oto' => 'Otomian languages',
  'pal' => 'Pahlavi',
  'pau' => 'Palauan',
  'pli' => 'Pali',
  'pam' => 'Pampanga',
  'pag' => 'Pangasinan',
  'pan' => 'Panjabi',
  'pap' => 'Papiamento',
  'paa' => 'Papuan (Other)',
  'per' => 'Persian',
  'fas' => 'Persian',
  'peo' => 'Persian, Old (ca.600-400)',
  'phi' => 'Philippine (Other)',
  'phn' => 'Phoenician',
  'pon' => 'Pohnpeian',
  'pol' => 'Polish',
  'por' => 'Portuguese',
  'pra' => 'Prakrit languages',
  'oci' => 'Provencal',
  'pro' => 'Provencal, Old (to 1500)',
  'pan' => 'Punjabi',
  'pus' => 'Pushto',
  'que' => 'Quechua',
  'roh' => 'Raeto-Romance',
  'raj' => 'Rajasthani',
  'rap' => 'Rapanui',
  'rar' => 'Rarotongan',
  'qaa' => 'Reserved for local use',
  'qtz' => 'Reserved for local use',
  'roa' => 'Romance (Other)',
  'rum' => 'Romanian',
  'ron' => 'Romanian',
  'rom' => 'Romany',
  'run' => 'Rundi',
  'rus' => 'Russian',
  'sal' => 'Salishan languages',
  'sam' => 'Samaritan Aramaic',
  'smi' => 'Sami languages (Other)',
  'smo' => 'Samoan',
  'sad' => 'Sandawe',
  'sag' => 'Sango',
  'san' => 'Sanskrit',
  'sat' => 'Santali',
  'srd' => 'Sardinian',
  'sas' => 'Sasak',
  'nds' => 'Saxon, Low',
  'sco' => 'Scots',
  'gla' => 'Scottish Gaelic',
  'sel' => 'Selkup',
  'sem' => 'Semitic (Other)',
  'nso' => 'Sepedi; Northern Sotho; Pedi',
  'scc' => 'Serbian',
  'srp' => 'Serbian',
  'srr' => 'Serer',
  'shn' => 'Shan',
  'sna' => 'Shona',
  'iii' => 'Sichuan Yi',
  'scn' => 'Sicilian',
  'sid' => 'Sidamo',
  'sgn' => 'Sign languages',
  'bla' => 'Siksika',
  'snd' => 'Sindhi',
  'sin' => 'Sinhalese',
  'sit' => 'Sino-Tibetan (Other)',
  'sio' => 'Siouan languages',
  'sms' => 'Skolt Sami',
  'den' => 'Slave (Athapascan)',
  'sla' => 'Slavic (Other)',
  'slo' => 'Slovak',
  'slk' => 'Slovak',
  'slv' => 'Slovenian',
  'sog' => 'Sogdian',
  'som' => 'Somali',
  'son' => 'Songhai',
  'snk' => 'Soninke',
  'wen' => 'Sorbian languages',
  'nso' => 'Sotho, Northern',
  'sot' => 'Sotho, Southern',
  'sai' => 'South American Indian (Other)',
  'alt' => 'Southern Altai',
  'sma' => 'Southern Sami',
  'nbl' => 'South Ndebele',
  'spa' => 'Spanish',
  'srn' => 'Sranan Tongo',
  'suk' => 'Sukuma',
  'sux' => 'Sumerian',
  'sun' => 'Sundanese',
  'sus' => 'Susu',
  'swa' => 'Swahili',
  'ssw' => 'Swati',
  'swe' => 'Swedish',
  'gsw' => 'Swiss German; Alemanic',
  'syr' => 'Syriac',
  'tgl' => 'Tagalog',
  'tah' => 'Tahitian',
  'tai' => 'Tai (Other)',
  'tgk' => 'Tajik',
  'tmh' => 'Tamashek',
  'tam' => 'Tamil',
  'tat' => 'Tatar',
  'tel' => 'Telugu',
  'ter' => 'Tereno',
  'tet' => 'Tetum',
  'tha' => 'Thai',
  'tib' => 'Tibetan',
  'bod' => 'Tibetan',
  'tig' => 'Tigre',
  'tir' => 'Tigrinya',
  'tem' => 'Timne',
  'tiv' => 'Tiv',
  'tlh' => 'tlhIngan-Hol; Klingon',
  'tli' => 'Tlingit',
  'tpi' => 'Tok Pisin',
  'tkl' => 'Tokelau',
  'tog' => 'Tonga (Nyasa)',
  'ton' => 'Tonga (Tonga Islands)',
  'tsi' => 'Tsimshian',
  'tso' => 'Tsonga',
  'tsn' => 'Tswana',
  'tum' => 'Tumbuka',
  'tup' => 'Tupi languages',
  'tur' => 'Turkish',
  'ota' => 'Turkish, Ottoman (1500-1928)',
  'tuk' => 'Turkmen',
  'tvl' => 'Tuvalu',
  'tyv' => 'Tuvinian',
  'twi' => 'Twi',
  'udm' => 'Udmurt',
  'uga' => 'Ugaritic',
  'uig' => 'Uighur',
  'ukr' => 'Ukrainian',
  'umb' => 'Umbundu',
  'und' => 'Undetermined',
  'hsb' => 'Upper Sorbian',
  'urd' => 'Urdu',
  'uzb' => 'Uzbek',
  'vai' => 'Vai',
  'cat' => 'Valencian',
  'ven' => 'Venda',
  'vie' => 'Vietnamese',
  'vol' => 'Volapuk',
  'vot' => 'Votic',
  'wak' => 'Wakashan languages',
  'wal' => 'Walamo',
  'wln' => 'Walloon',
  'war' => 'Waray',
  'was' => 'Washo',
  'wel' => 'Welsh',
  'cym' => 'Welsh',
  'fry' => 'Wester Frisian',
  'wol' => 'Wolof',
  'xho' => 'Xhosa',
  'sah' => 'Yakut',
  'yao' => 'Yao',
  'yap' => 'Yapese',
  'yid' => 'Yiddish',
  'yor' => 'Yoruba',
  'ypk' => 'Yupik languages',
  'znd' => 'Zande',
  'zap' => 'Zapotec',
  'zen' => 'Zenaga',
  'zha' => 'Zhuang',
  'zul' => 'Zulu',
  'zun' => 'Zuni'
}

# Reports whether +value+ names a language listed in ISO_LANG.
#
# Only the primary subtag (the text before the first '-') is looked up, and the
# lookup is case-insensitive, so "EN-us" is checked as "en". Any region or
# variant subtag after the first '-' is ignored.
#
# value:: the language tag to test. It is converted with +to_s+, so +nil+
#         yields false instead of raising NoMethodError.
#
# Returns true when the primary subtag is a key of ISO_LANG, false otherwise.
def is_valid_lang_code(value)
  # "".split('-', 2) is [], so +first+ may be nil; to_s turns that into "".
  primary = value.to_s.split('-', 2).first
  ISO_LANG.key?(primary.to_s.downcase)
end

# ---------------------------------------------------------------------------
# Next file in this patch:
#   vendor/plugins/HTML5lib/lib/html5/filters/rfc2046.rb (new file, mode 100755)
# ---------------------------------------------------------------------------
# adapted from feedvalidator, original copyright license is
#
# Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# One MIME "token": a run of characters that are neither whitespace nor one of
# the separator characters ( ) < > , ; : " / [ ] ? =
# (the same exclusion list as the commented-out pattern this file shipped with).
MIME_TOKEN = /[^\s()<>,;:"\/\[\]?=]+/

# type "/" subtype, followed by any number of ";"-separated attribute=value
# parameters whose value is either a token or a double-quoted string.
MIME_TYPE = /\A#{MIME_TOKEN}\/#{MIME_TOKEN}(?:\s*;\s*#{MIME_TOKEN}=(?:"(?:\\.|[^"\\])*"|#{MIME_TOKEN}))*\z/

# Checks a MIME media type string such as "text/html; charset=utf-8".
#
# As shipped, this function was a stub: the real check was commented out and
# every value was accepted. That default is kept so existing callers see no
# change; pass +strict+ = true to actually validate the syntax.
# NOTE(review): the commented-out check referred to a top-level local
# (+mime_re+), which a method body cannot see; that is presumably why it was
# disabled -- confirm before making strict the default.
#
# value::  the media type to check (converted with +to_s+ in strict mode).
# strict:: when true, +value+ must match MIME_TYPE in full; when false
#          (default) no checking is done.
#
# Returns true or false.
def is_valid_mime_type(value, strict = false)
  return true unless strict
  !!MIME_TYPE.match(value.to_s)
end

# ---------------------------------------------------------------------------
# Next file in this patch:
#   vendor/plugins/HTML5lib/lib/html5/filters/rfc3987.rb (new file, mode 100755)
# ---------------------------------------------------------------------------
# adapted from feedvalidator, original copyright license is
#
# Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# URI schemes registered with IANA
# (http://www.iana.org/assignments/uri-schemes.html).
IANA_SCHEMES = [
  "ftp", "http", "gopher", "mailto", "news", "nntp", "telnet", "wais",
  "file", "prospero", "z39.50s", "z39.50r", "cid", "mid", "vemmi",
  "service", "imap", "nfs", "acap", "rtsp", "tip", "pop", "data", "dav",
  "opaquelocktoken", "sip", "sips", "tel", "fax", "modem", "ldap",
  "https", "soap.beep", "soap.beeps", "xmlrpc.beep", "xmlrpc.beeps",
  "urn", "go", "h323", "ipp", "tftp", "mupdate", "pres", "im", "mtqp",
  "iris.beep", "dict", "snmp", "crid", "tag", "dns", "info"
]

# Schemes is_valid_uri accepts without reporting "invalid-scheme".
ALLOWED_SCHEMES = IANA_SCHEMES + ['javascript']

# All patterns below are regexp literals anchored with \A and \z. Building
# them from double-quoted strings silently turned "\d" into "d", and "^"/"$"
# only anchor a single line in Ruby, which let multi-line values through.

# Loose RFC 2396 URI reference: optional scheme, up to two slashes, URI characters.
RFC2396 = %r{\A([a-zA-Z][0-9a-zA-Z+\-.]*:)?/{0,2}[0-9a-zA-Z;/?:@&=+$.\-_!~*'()%,#]*\z}

# Fully qualified URI: the scheme is mandatory and the remainder is non-empty.
RFC2396_FULL = %r{\A[a-zA-Z][0-9a-zA-Z+\-.]*:(//)?[0-9a-zA-Z;/?:@&=+$.\-_!~*'()%,#]+\z}

# urn:<namespace id>:<namespace specific string>
URN = %r{\A[Uu][Rr][Nn]:[a-zA-Z0-9][a-zA-Z0-9-]{1,31}:([a-zA-Z0-9()+,.:=@;$_!*'\-]|%[0-9A-Fa-f]{2})+\z}

# tag:<authority>,<date>:<specific>[#<fragment>]
TAG = %r{\Atag:([a-z0-9\-._]+?@)?[a-z0-9.\-]+?,\d{4}(-\d{2}(-\d{2})?)?:[0-9a-zA-Z;/?:@&=+$.\-_!~*'()%,]*(#[0-9a-zA-Z;/?:@&=+$.\-_!~*'()%,]*)?\z}

# Exactly one character that may appear in a URI reference.
URI_CHAR = %r{\A[0-9a-zA-Z;/?:@&=+$.\-_!~*'()%,#]\z}

# Maps an IRI onto a URI by percent-encoding every byte >= 0x80 (the generic
# mapping from RFC 3987 section 3.1). ASCII bytes are copied unchanged; nil is
# treated as the empty string.
def percent_encode_non_ascii(value)
  value.to_s.unpack('C*').map { |b| b < 128 ? b.chr : format('%%%02X', b) }.join
end

# True when +pattern+ matches the whole of +value+ and +value+ is not empty.
# The equality check also protects against caller-supplied patterns that are
# not anchored.
def uri_fully_matches?(pattern, value)
  match = pattern.match(value)
  !match.nil? && !match[0].empty? && match[0] == value
end

# Validates +value+ as a URI reference.
#
# value::       the candidate URI; nil is treated as the empty string.
# uri_pattern:: Regexp the value must match in full (defaults to RFC2396).
#
# Returns a two element array [valid, error_code]. error_code is "" on
# success, otherwise one of "invalid-tag-uri", "invalid-urn",
# "invalid-uri-char", "uri-not-iri", "invalid-uri",
# "invalid-http-or-ftp-uri" or "invalid-scheme".
def is_valid_uri(value, uri_pattern = RFC2396)
  value = value.to_s
  # Everything before the first ':' (the whole value when there is none).
  # Always a String, so values such as ":" no longer crash on nil.
  scheme = value[/\A[^:]*/].downcase

  if scheme == 'tag'
    return [false, "invalid-tag-uri"] unless TAG =~ value
  elsif scheme == "urn"
    return [false, "invalid-urn"] unless URN =~ value
  elsif !uri_fully_matches?(uri_pattern, value)
    # Report the most specific reason available. Bytes >= 128 are not
    # judged here; they are what makes the value an IRI rather than a URI.
    value.each_byte do |b|
      return [false, "invalid-uri-char"] if b < 128 && !(URI_CHAR =~ b.chr)
    end
    # Only legal ASCII characters were found: either the value is a valid
    # IRI (it matches once non-ASCII bytes are percent-encoded) or it is
    # simply malformed. Either way it is not a valid URI.
    if uri_fully_matches?(uri_pattern, percent_encode_non_ascii(value))
      return [false, "uri-not-iri"]
    end
    return [false, "invalid-uri"]
  elsif ['http', 'ftp'].include?(scheme)
    return [false, "invalid-http-or-ftp-uri"] unless value =~ %r{\A\w+://[^/].*}
  elsif value.index(':') && scheme =~ /\A[a-z]+\z/ && !ALLOWED_SCHEMES.include?(scheme)
    return [false, "invalid-scheme"]
  end
  [true, ""]
end

# Validates +value+ as an IRI: non-ASCII bytes are percent-encoded and the
# result is checked with is_valid_uri. Returns the same [valid, error_code]
# pair as is_valid_uri.
def is_valid_iri(value)
  is_valid_uri(percent_encode_non_ascii(value))
end

# Validates +value+ as a URI that must carry a scheme (RFC2396_FULL).
# Returns the same [valid, error_code] pair as is_valid_uri.
def is_valid_fully_qualified_uri(value)
  is_valid_uri(value, RFC2396_FULL)
end

# ---------------------------------------------------------------------------
# diff --git a/vendor/plugins/HTML5lib/lib/html5/filters/validator.rb b/vendor/plugins/HTML5lib/lib/html5/filters/validator.rb
# new file mode 100755
# index 00000000..99e67baa
# --- /dev/null
# +++ b/vendor/plugins/HTML5lib/lib/html5/filters/validator.rb
# @@ -0,0 +1,830 @@
# ---------------------------------------------------------------------------
# HTML 5 conformance checker
#
# Warning: this module is experimental, incomplete, and subject to removal at any time.
+# +# Usage: +# >>> from html5lib.html5parser import HTMLParser +# >>> from html5lib.filters.validator import HTMLConformanceChecker +# >>> p = HTMLParser(tokenizer=HTMLConformanceChecker) +# >>> p.parse('\n') +# < nil> +# >>> p.errors +# [((2, 14), 'unknown-attribute', {'attributeName' => u'foo', 'tagName' => u'html'})] + +require 'html5/constants' +require 'html5/filters/base' +require 'html5/filters/iso639codes' +require 'html5/filters/rfc3987' +require 'html5/filters/rfc2046' + +def _(str); str; end + +class String + # lifted from rails + def underscore() + self.gsub(/::/, '/'). + gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2'). + gsub(/([a-z\d])([A-Z])/,'\1_\2'). + tr("-", "_"). + downcase + end +end + +HTML5::E.update({ + "unknown-start-tag" => + _("Unknown start tag <%(tagName)>."), + "unknown-attribute" => + _("Unknown '%(attributeName)' attribute on <%(tagName)>."), + "missing-required-attribute" => + _("The '%(attributeName)' attribute is required on <%(tagName)>."), + "unknown-input-type" => + _("Illegal value for attribute on ."), + "attribute-not-allowed-on-this-input-type" => + _("The '%(attributeName)' attribute is not allowed on ."), + "deprecated-attribute" => + _("This attribute is deprecated: '%(attributeName)' attribute on <%(tagName)>."), + "duplicate-value-in-token-list" => + _("Duplicate value in token list: '%(attributeValue)' in '%(attributeName)' attribute on <%(tagName)>."), + "invalid-attribute-value" => + _("Invalid attribute value: '%(attributeName)' attribute on <%(tagName)>."), + "space-in-id" => + _("Whitespace is not allowed here: '%(attributeName)' attribute on <%(tagName)>."), + "duplicate-id" => + _("This ID was already defined earlier: 'id' attribute on <%(tagName)>."), + "attribute-value-can-not-be-blank" => + _("This value can not be blank: '%(attributeName)' attribute on <%(tagName)>."), + "id-does-not-exist" => + _("This value refers to a non-existent ID: '%(attributeName)' attribute on <%(tagName)>."), + "invalid-enumerated-value" => 
+ _("Value must be one of %(enumeratedValues): '%(attributeName)' attribute on <%tagName)>."), + "invalid-boolean-value" => + _("Value must be one of %(enumeratedValues): '%(attributeName)' attribute on <%tagName)>."), + "contextmenu-must-point-to-menu" => + _("The contextmenu attribute must point to an ID defined on a element."), + "invalid-lang-code" => + _("Invalid language code: '%(attributeName)' attibute on <%(tagName)>."), + "invalid-integer-value" => + _("Value must be an integer: '%(attributeName)' attribute on <%tagName)>."), + "invalid-root-namespace" => + _("Root namespace must be 'http://www.w3.org/1999/xhtml', or omitted."), + "invalid-browsing-context" => + _("Value must be one of ('_self', '_parent', '_top'), or a name that does not start with '_' => '%(attributeName)' attribute on <%(tagName)>."), + "invalid-tag-uri" => + _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."), + "invalid-urn" => + _("Invalid URN: '%(attributeName)' attribute on <%(tagName)>."), + "invalid-uri-char" => + _("Illegal character in URI: '%(attributeName)' attribute on <%(tagName)>."), + "uri-not-iri" => + _("Expected a URI but found an IRI: '%(attributeName)' attribute on <%(tagName)>."), + "invalid-uri" => + _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."), + "invalid-http-or-ftp-uri" => + _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."), + "invalid-scheme" => + _("Unregistered URI scheme: '%(attributeName)' attribute on <%(tagName)>."), + "invalid-rel" => + _("Invalid link relation: '%(attributeName)' attribute on <%(tagName)>."), + "invalid-mime-type" => + _("Invalid MIME type: '%(attributeName)' attribute on <%(tagName)>."), +}) + + +class HTMLConformanceChecker < HTML5::Filters::Base + + @@global_attributes = %w[class contenteditable contextmenu dir + draggable id irrelevant lang ref tabindex template + title onabort onbeforeunload onblur onchange onclick + oncontextmenu ondblclick ondrag ondragend ondragenter + ondragleave 
ondragover ondragstart ondrop onerror + onfocus onkeydown onkeypress onkeyup onload onmessage + onmousedown onmousemove onmouseout onmouseover onmouseup + onmousewheel onresize onscroll onselect onsubmit onunload] + # XXX lang in HTML only, xml:lang in XHTML only + # XXX validate ref, template + + @@allowed_attribute_map = { + 'html' => %w[xmlns], + 'head' => [], + 'title' => [], + 'base' => %w[href target], + 'link' => %w[href rel media hreflang type], + 'meta' => %w[name http-equiv content charset], # XXX charset in HTML only + 'style' => %w[media type scoped], + 'body' => [], + 'section' => [], + 'nav' => [], + 'article' => [], + 'blockquote' => %w[cite], + 'aside' => [], + 'h1' => [], + 'h2' => [], + 'h3' => [], + 'h4' => [], + 'h5' => [], + 'h6' => [], + 'header' => [], + 'footer' => [], + 'address' => [], + 'p' => [], + 'hr' => [], + 'br' => [], + 'dialog' => [], + 'pre' => [], + 'ol' => %w[start], + 'ul' => [], + 'li' => %w[value], # XXX depends on parent + 'dl' => [], + 'dt' => [], + 'dd' => [], + 'a' => %w[href target ping rel media hreflang type], + 'q' => %w[cite], + 'cite' => [], + 'em' => [], + 'strong' => [], + 'small' => [], + 'm' => [], + 'dfn' => [], + 'abbr' => [], + 'time' => %w[datetime], + 'meter' => %w[value min low high max optimum], + 'progress' => %w[value max], + 'code' => [], + 'var' => [], + 'samp' => [], + 'kbd' => [], + 'sup' => [], + 'sub' => [], + 'span' => [], + 'i' => [], + 'b' => [], + 'bdo' => [], + 'ins' => %w[cite datetime], + 'del' => %w[cite datetime], + 'figure' => [], + 'img' => %w[alt src usemap ismap height width], # XXX ismap depends on parent + 'iframe' => %w[src], + # handled separately + 'object' => %w[data type usemap height width], + 'param' => %w[name value], + 'video' => %w[src autoplay start loopstart loopend end loopcount controls], + 'audio' => %w[src autoplay start loopstart loopend end loopcount controls], + 'source' => %w[src type media], + 'canvas' => %w[height width], + 'map' => [], + 'area' => %w[alt 
coords shape href target ping rel media hreflang type], + 'table' => [], + 'caption' => [], + 'colgroup' => %w[span], # XXX only if element contains no elements + 'col' => %w[span], + 'tbody' => [], + 'thead' => [], + 'tfoot' => [], + 'tr' => [], + 'td' => %w[colspan rowspan], + 'th' => %w[colspan rowspan scope], + # all possible attributes are listed here but is really handled separately + 'input' => %w[accept accesskey action alt autocomplete autofocus checked + disabled enctype form inputmode list maxlength method min + max name pattern step readonly replace required size src + tabindex target template value + ], + 'form' => %w[action method enctype accept name onsubmit onreset accept-charset + data replace + ], + 'button' => %w[action enctype method replace template name value type disabled form autofocus], # XXX may need matrix of acceptable attributes based on value of type attribute (like input) + 'select' => %w[name size multiple disabled data accesskey form autofocus], + 'optgroup' => %w[disabled label], + 'option' => %w[selected disabled label value], + 'textarea' => %w[maxlength name rows cols disabled readonly required form autofocus wrap accept], + 'label' => %w[for accesskey form], + 'fieldset' => %w[disabled form], + 'output' => %w[form name for onforminput onformchange], + 'datalist' => %w[data], + # XXX repetition model for repeating form controls + 'script' => %w[src defer async type], + 'noscript' => [], + 'noembed' => [], + 'event-source' => %w[src], + 'details' => %w[open], + 'datagrid' => %w[multiple disabled], + 'command' => %w[type label icon hidden disabled checked radiogroup default], + 'menu' => %w[type label autosubmit], + 'datatemplate' => [], + 'rule' => [], + 'nest' => [], + 'legend' => [], + 'div' => [], + 'font' => %w[style] + } + + @@required_attribute_map = { + 'link' => %w[href rel], + 'bdo' => %w[dir], + 'img' => %w[src], + 'embed' => %w[src], + 'object' => [], # XXX one of 'data' or 'type' is required + 'param' => %w[name 
value], + 'source' => %w[src], + 'map' => %w[id] + } + + @@input_type_allowed_attribute_map = { + 'text' => %w[accesskey autocomplete autofocus disabled form inputmode list maxlength name pattern readonly required size tabindex value], + 'password' => %w[accesskey autocomplete autofocus disabled form inputmode maxlength name pattern readonly required size tabindex value], + 'checkbox' => %w[accesskey autofocus checked disabled form name required tabindex value], + 'radio' => %w[accesskey autofocus checked disabled form name required tabindex value], + 'button' => %w[accesskey autofocus disabled form name tabindex value], + 'submit' => %w[accesskey action autofocus disabled enctype form method name replace tabindex target value], + 'reset' => %w[accesskey autofocus disabled form name tabindex value], + 'add' => %w[accesskey autofocus disabled form name tabindex template value], + 'remove' => %w[accesskey autofocus disabled form name tabindex value], + 'move-up' => %w[accesskey autofocus disabled form name tabindex value], + 'move-down' => %w[accesskey autofocus disabled form name tabindex value], + 'file' => %w[accept accesskey autofocus disabled form min max name required tabindex], + 'hidden' => %w[disabled form name value], + 'image' => %w[accesskey action alt autofocus disabled enctype form method name replace src tabindex target], + 'datetime' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value], + 'datetime-local' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value], + 'date' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value], + 'month' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value], + 'week' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value], + 'time' => %w[accesskey 
autocomplete autofocus disabled form list min max name step readonly required tabindex value], + 'number' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value], + 'range' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value], + 'email' => %w[accesskey autocomplete autofocus disabled form inputmode list maxlength name pattern readonly required tabindex value], + 'url' => %w[accesskey autocomplete autofocus disabled form inputmode list maxlength name pattern readonly required tabindex value], + } + + @@input_type_deprecated_attribute_map = { + 'text' => ['size'], + 'password' => ['size'] + } + + @@link_rel_values = %w[alternate archive archives author contact feed first begin start help icon index top contents toc last end license copyright next pingback prefetch prev previous search stylesheet sidebar tag up] + @@a_rel_values = %w[alternate archive archives author contact feed first begin start help index top contents toc last end license copyright next prev previous search sidebar tag up bookmark external nofollow] + + def initialize(stream, *args) + super(HTML5::HTMLTokenizer.new(stream, *args)) + @things_that_define_an_id = [] + @things_that_point_to_an_id = [] + @ids_we_have_known_and_loved = [] + end + + def each + __getobj__.each do |token| + method = "validate_#{token.fetch(:type, '-').to_s.underscore}_#{token.fetch(:name, '-').to_s.underscore}" + if respond_to?(method) + send(method, token){|t| yield t } + else + method = "validate_#{token.fetch(:type, '-').to_s.underscore}" + if respond_to?(method) + send(method, token) do |t| + yield t + end + end + end + yield token + end + eof do |t| + yield t + end + end + + ########################################################################## + # Start tag validation + ########################################################################## + + def validate_start_tag(token) + 
check_unknown_start_tag(token){|t| yield t} + check_start_tag_required_attributes(token) do |t| + yield t + end + check_start_tag_unknown_attributes(token) do |t| + yield t + end + check_attribute_values(token) do |t| + yield t + end + end + + def validate_start_tag_embed(token) + check_start_tag_required_attributes(token) do |t| + yield t + end + check_attribute_values(token) do |t| + yield t + end + # spec says "any attributes w/o namespace" + # so don't call check_start_tag_unknown_attributes + end + + def validate_start_tag_input(token) + check_attribute_values(token) do |t| + yield t + end + attr_dict = Hash[*token[:data].collect{|(name, value)| [name.downcase, value]}.flatten] + input_type = attr_dict.fetch('type', "text") + if !@@input_type_allowed_attribute_map.keys().include?(input_type) + yield({:type => "ParseError", + :data => "unknown-input-type", + :datavars => {:attrValue => input_type}}) + end + allowed_attributes = @@input_type_allowed_attribute_map.fetch(input_type, []) + attr_dict.each do |attr_name, attr_value| + if !@@allowed_attribute_map['input'].include?(attr_name) + yield({:type => "ParseError", + :data => "unknown-attribute", + :datavars => {"tagName" => "input", + "attributeName" => attr_name}}) + elsif !allowed_attributes.include?(attr_name) + yield({:type => "ParseError", + :data => "attribute-not-allowed-on-this-input-type", + :datavars => {"attributeName" => attr_name, + "inputType" => input_type}}) + end + if @@input_type_deprecated_attribute_map.fetch(input_type, []).include?(attr_name) + yield({:type => "ParseError", + :data => "deprecated-attribute", + :datavars => {"attributeName" => attr_name, + "inputType" => input_type}}) + end + end + end + + ########################################################################## + # Start tag validation helpers + ########################################################################## + + def check_unknown_start_tag(token) + # check for recognized tag name + name = (token[:name] || 
"").downcase + if !@@allowed_attribute_map.keys.include?(name) + yield({:type => "ParseError", + :data => "unknown-start-tag", + :datavars => {"tagName" => name}}) + end + end + + def check_start_tag_required_attributes(token) + # check for presence of required attributes + name = (token[:name] || "").downcase + if @@required_attribute_map.keys().include?(name) + attrs_present = (token[:data] || []).collect{|t| t[0]} + for attr_name in @@required_attribute_map[name] + if !attrs_present.include?(attr_name) + yield( {:type => "ParseError", + :data => "missing-required-attribute", + :datavars => {"tagName" => name, + "attributeName" => attr_name}}) + end + end + end + end + + def check_start_tag_unknown_attributes(token) + # check for recognized attribute names + name = token[:name].downcase + allowed_attributes = @@global_attributes | @@allowed_attribute_map.fetch(name, []) + for attr_name, attr_value in token.fetch(:data, []) + if !allowed_attributes.include?(attr_name.downcase()) + yield( {:type => "ParseError", + :data => "unknown-attribute", + :datavars => {"tagName" => name, + "attributeName" => attr_name}}) + end + end + end + + ########################################################################## + # Attribute validation helpers + ########################################################################## + +# def checkURI(token, tag_name, attr_name, attr_value) +# is_valid, error_code = rfc3987.is_valid_uri(attr_value) +# if not is_valid +# yield {:type => "ParseError", +# :data => error_code, +# :datavars => {"tagName" => tag_name, +# "attributeName" => attr_name}} +# yield {:type => "ParseError", +# :data => "invalid-attribute-value", +# :datavars => {"tagName" => tag_name, +# "attributeName" => attr_name}} + + def check_iri(token, tag_name, attr_name, attr_value) + is_valid, error_code = is_valid_iri(attr_value) + if !is_valid + yield({:type => "ParseError", + :data => error_code, + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name}}) 
+ yield({:type => "ParseError", + :data => "invalid-attribute-value", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name}}) + end + end + + def check_id(token, tag_name, attr_name, attr_value) + if !attr_value || attr_value.length == 0 + yield({:type => "ParseError", + :data => "attribute-value-can-not-be-blank", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name}}) + end + attr_value.each_byte do |b| + c = [b].pack('c*') + if HTML5::SPACE_CHARACTERS.include?(c) + yield( {:type => "ParseError", + :data => "space-in-id", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name}}) + yield( {:type => "ParseError", + :data => "invalid-attribute-value", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name}}) + break + end + end + end + + def parse_token_list(value) + valueList = [] + currentValue = '' + (value + ' ').each_byte do |b| + c = [b].pack('c*') + if HTML5::SPACE_CHARACTERS.include?(c) + if currentValue.length > 0 + valueList << currentValue + currentValue = '' + end + else + currentValue += c + end + end + if currentValue.length > 0 + valueList << currentValue + end + valueList + end + + def check_token_list(tag_name, attr_name, attr_value) + # The "token" in the method name refers to tokens in an attribute value + # i.e. http://www.whatwg.org/specs/web-apps/current-work/#set-of + # but the "token" parameter refers to the token generated from + # HTMLTokenizer. Sorry for the confusion. 
+ value_list = parse_token_list(attr_value) + value_dict = {} + for current_value in value_list + if value_dict.has_key?(current_value) + yield({:type => "ParseError", + :data => "duplicate-value-in-token-list", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name, + "attributeValue" => current_value}}) + break + end + value_dict[current_value] = 1 + end + end + + def check_enumerated_value(token, tag_name, attr_name, attr_value, enumerated_values) + if !attr_value || attr_value.length == 0 + yield( {:type => "ParseError", + :data => "attribute-value-can-not-be-blank", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name}}) + return + end + attr_value.downcase! + if !enumerated_values.include?(attr_value) + yield( {:type => "ParseError", + :data => "invalid-enumerated-value", + :datavars => {"tagName" => tag_name, + "attribute_name" => attr_name, + "enumeratedValues" => enumerated_values}}) + yield( {:type => "ParseError", + :data => "invalid-attribute-value", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name}}) + end + end + + def check_boolean(token, tag_name, attr_name, attr_value) + enumerated_values = [attr_name, ''] + if !enumerated_values.include?(attr_value) + yield( {:type => "ParseError", + :data => "invalid-boolean-value", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name, + "enumeratedValues" => enumerated_values}}) + yield( {:type => "ParseError", + :data => "invalid-attribute-value", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name}}) + end + end + + def check_integer(token, tag_name, attr_name, attr_value) + sign = 1 + number_string = '' + state = 'begin' # ('begin', 'initial-number', 'number', 'trailing-junk') + error = {:type => "ParseError", + :data => "invalid-integer-value", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name, + "attributeValue" => attr_value}} + attr_value.scan(/./) do |c| + if state == 'begin' + if 
HTML5::SPACE_CHARACTERS.include?(c) + next + elsif c == '-' + sign = -1 + state = 'initial-number' + elsif HTML5::DIGITS.include?(c) + number_string += c + state = 'in-number' + else + yield error + return + end + elsif state == 'initial-number' + if !HTML5::DIGITS.include?(c) + yield error + return + end + number_string += c + state = 'in-number' + elsif state == 'in-number' + if HTML5::DIGITS.include?(c) + number_string += c + else + state = 'trailing-junk' + end + elsif state == 'trailing-junk' + next + end + end + if number_string.length == 0 + yield( {:type => "ParseError", + :data => "attribute-value-can-not-be-blank", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name}}) + end + end + + def check_floating_point_number(token, tag_name, attr_name, attr_value) + # XXX + end + + def check_browsing_context(token, tag_name, attr_name, attr_value) + return if not attr_value + return if attr_value[0] != ?_ + attr_value.downcase! + return if ['_self', '_parent', '_top', '_blank'].include?(attr_value) + yield({:type => "ParseError", + :data => "invalid-browsing-context", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name}}) + end + + def check_lang_code(token, tag_name, attr_name, attr_value) + return if !attr_value || attr_value == '' # blank is OK + if not is_valid_lang_code(attr_value) + yield( {:type => "ParseError", + :data => "invalid-lang-code", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name, + "attributeValue" => attr_value}}) + end + end + + def check_mime_type(token, tag_name, attr_name, attr_value) + # XXX needs tests + if not attr_value + yield( {:type => "ParseError", + :data => "attribute-value-can-not-be-blank", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name}}) + end + if not is_valid_mime_type(attr_value) + yield( {:type => "ParseError", + :data => "invalid-mime-type", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name, + "attributeValue" => 
attr_value}}) + end + end + + def check_media_query(token, tag_name, attr_name, attr_value) + # XXX + end + + def check_link_relation(token, tag_name, attr_name, attr_value) + check_token_list(tag_name, attr_name, attr_value) do |t| + yield t + end + value_list = parse_token_list(attr_value) + allowed_values = tag_name == 'link' ? @@link_rel_values : @@a_rel_values + for current_value in value_list + if !allowed_values.include?(current_value) + yield({:type => "ParseError", + :data => "invalid-rel", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name}}) + end + end + end + + def check_date_time(token, tag_name, attr_name, attr_value) + # XXX + state = 'begin' # ('begin', '... +# for c in attr_value +# if state == 'begin' => +# if SPACE_CHARACTERS.include?(c) +# continue +# elsif digits.include?(c) +# state = ... + end + + ########################################################################## + # Attribute validation + ########################################################################## + + def check_attribute_values(token) + tag_name = token.fetch(:name, "") + for attr_name, attr_value in token.fetch(:data, []) + attr_name = attr_name.downcase + method = "validate_attribute_value_#{tag_name.to_s.underscore}_#{attr_name.to_s.underscore}" + if respond_to?(method) + send(method, token, tag_name, attr_name, attr_value) do |t| + yield t + end + else + method = "validate_attribute_value_#{attr_name.to_s.underscore}" + if respond_to?(method) + send(method, token, tag_name, attr_name, attr_value) do |t| + yield t + end + end + end + end + end + + def validate_attribute_value_class(token, tag_name, attr_name, attr_value) + check_token_list(tag_name, attr_name, attr_value) do |t| + yield t + yield( {:type => "ParseError", + :data => "invalid-attribute-value", + :datavars => {"tagName" => tag_name, + "attributeName" => attr_name}}) + end + end + + def validate_attribute_value_contenteditable(token, tag_name, attr_name, attr_value) + 
check_enumerated_value(token, tag_name, attr_name, attr_value, ['true', 'false', '']) do |t| + yield t + end + end + + def validate_attribute_value_dir(token, tag_name, attr_name, attr_value) + check_enumerated_value(token, tag_name, attr_name, attr_value, ['ltr', 'rtl']) do |t| + yield t + end + end + + def validate_attribute_value_draggable(token, tag_name, attr_name, attr_value) + check_enumerated_value(token, tag_name, attr_name, attr_value, ['true', 'false']) do |t| + yield t + end + end + + alias validate_attribute_value_irrelevant check_boolean + alias validate_attribute_value_lang check_lang_code + + def validate_attribute_value_contextmenu(token, tag_name, attr_name, attr_value) + check_id(token, tag_name, attr_name, attr_value) do |t| + yield t + end + @things_that_point_to_an_id << token + end + + def validate_attribute_value_id(token, tag_name, attr_name, attr_value) + # This method has side effects. It adds 'token' to the list of + # things that define an ID (@things_that_define_an_id) so that we can + # later check 1) whether an ID is duplicated, and 2) whether all the + # things that point to something else by ID (like