From 8ccaad85a5389c89eace76d957fb1da9ae60fd08 Mon Sep 17 00:00:00 2001 From: Jacques Distler Date: Wed, 4 Jul 2007 17:36:59 -0500 Subject: [PATCH 1/7] Sync with latest HTML5lib and latest Maruku --- lib/sanitize.rb | 14 +- .../HTML5lib/lib/{html5lib.rb => html5.rb} | 22 +- .../plugins/HTML5lib/lib/html5/constants.rb | 817 ++++++++++++++++++ vendor/plugins/HTML5lib/lib/html5/filters.rb | 1 + .../lib/{html5lib => html5}/filters/base.rb | 2 +- .../filters/inject_meta_charset.rb | 4 +- .../filters/optionaltags.rb | 6 +- .../{html5lib => html5}/filters/sanitizer.rb | 6 +- .../{html5lib => html5}/filters/whitespace.rb | 6 +- .../lib/{html5lib => html5}/html5parser.rb | 492 +++++------ .../html5parser/after_body_phase.rb | 4 +- .../html5parser/after_frameset_phase.rb | 4 +- .../html5parser/after_head_phase.rb | 4 +- .../html5parser/before_head_phase.rb | 6 +- .../html5parser/in_body_phase.rb | 36 +- .../html5parser/in_caption_phase.rb | 4 +- .../html5parser/in_cell_phase.rb | 4 +- .../html5parser/in_column_group_phase.rb | 4 +- .../html5parser/in_frameset_phase.rb | 4 +- .../html5parser/in_head_phase.rb | 6 +- .../html5parser/in_row_phase.rb | 4 +- .../html5parser/in_select_phase.rb | 4 +- .../html5parser/in_table_body_phase.rb | 4 +- .../html5parser/in_table_phase.rb | 4 +- .../html5parser/initial_phase.rb | 4 +- .../{html5lib => html5}/html5parser/phase.rb | 2 +- .../html5parser/root_element_phase.rb | 4 +- .../html5parser/trailing_end_phase.rb | 4 +- .../lib/{html5lib => html5}/inputstream.rb | 63 +- .../{html5lib => html5}/liberalxmlparser.rb | 24 +- .../lib/{html5lib => html5}/sanitizer.rb | 3 +- .../plugins/HTML5lib/lib/html5/serializer.rb | 2 + .../serializer/htmlserializer.rb | 15 +- .../serializer/xhtmlserializer.rb | 7 +- .../lib/{html5lib => html5}/tokenizer.rb | 103 ++- .../lib/{html5lib => html5}/treebuilders.rb | 8 +- .../{html5lib => html5}/treebuilders/base.rb | 4 +- .../treebuilders/hpricot.rb | 442 +++++----- .../{html5lib => html5}/treebuilders/rexml.rb | 4 +- .../treebuilders/simpletree.rb | 4 +- .../lib/{html5lib => html5}/treewalkers.rb | 10 +- .../{html5lib => html5}/treewalkers/base.rb | 4 +- .../treewalkers/hpricot.rb | 6 +- .../{html5lib => html5}/treewalkers/rexml.rb | 6 +- .../treewalkers/simpletree.rb | 8 +- .../HTML5lib/lib/html5lib/constants.rb | 708 --------------- .../plugins/HTML5lib/lib/html5lib/filters.rb | 1 - .../HTML5lib/lib/html5lib/serializer.rb | 2 - vendor/plugins/HTML5lib/parse.rb | 24 +- .../HTML5lib/testdata/encoding/tests2.dat | 1 - .../HTML5lib/testdata/serializer/core.test | 3 +- .../HTML5lib/testdata/serializer/options.test | 6 + .../HTML5lib/testdata/tokenizer/test1.test | 16 +- .../HTML5lib/testdata/tokenizer/test2.test | 14 +- .../testdata/tree-construction/tests1.dat | 11 +- .../testdata/tree-construction/tests2.dat | 1 + .../testdata/tree-construction/tests3.dat | 30 +- .../testdata/tree-construction/tests4.dat | 28 +- .../testdata/tree-construction/tests6.dat | 38 + vendor/plugins/HTML5lib/tests/preamble.rb | 162 ++-- .../plugins/HTML5lib/tests/test_encoding.rb | 16 +- vendor/plugins/HTML5lib/tests/test_lxp.rb | 75 +- vendor/plugins/HTML5lib/tests/test_parser.rb | 25 +- .../plugins/HTML5lib/tests/test_sanitizer.rb | 14 +- .../plugins/HTML5lib/tests/test_serializer.rb | 14 +- vendor/plugins/HTML5lib/tests/test_stream.rb | 4 +- .../plugins/HTML5lib/tests/test_tokenizer.rb | 6 +- .../HTML5lib/tests/test_treewalkers.rb | 33 +- .../HTML5lib/tests/tokenizer_test_parser.rb | 126 +-- .../maruku/lib/maruku/input/linesource.rb | 1 + .../maruku/lib/maruku/input/parse_block.rb | 47 +- 71 files changed, 1974 insertions(+), 1621 deletions(-) rename vendor/plugins/HTML5lib/lib/{html5lib.rb => html5.rb} (76%) create mode 100755 vendor/plugins/HTML5lib/lib/html5/constants.rb create mode 100644 vendor/plugins/HTML5lib/lib/html5/filters.rb rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/filters/base.rb (89%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/filters/inject_meta_charset.rb (98%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/filters/optionaltags.rb (99%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/filters/sanitizer.rb (73%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/filters/whitespace.rb (90%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser.rb (93%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/after_body_phase.rb (96%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/after_frameset_phase.rb (94%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/after_head_phase.rb (95%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/before_head_phase.rb (88%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/in_body_phase.rb (94%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/in_caption_phase.rb (97%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/in_cell_phase.rb (97%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/in_column_group_phase.rb (96%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/in_frameset_phase.rb (96%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/in_head_phase.rb (96%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/in_row_phase.rb (97%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/in_select_phase.rb (97%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/in_table_body_phase.rb (97%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/in_table_phase.rb (98%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/initial_phase.rb (99%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/phase.rb (99%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/root_element_phase.rb (94%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/html5parser/trailing_end_phase.rb (94%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/inputstream.rb (90%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/liberalxmlparser.rb (87%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/sanitizer.rb (99%) create mode 100644 vendor/plugins/HTML5lib/lib/html5/serializer.rb rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/serializer/htmlserializer.rb (93%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/serializer/xhtmlserializer.rb (72%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/tokenizer.rb (93%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/treebuilders.rb (70%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/treebuilders/base.rb (99%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/treebuilders/hpricot.rb (95%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/treebuilders/rexml.rb (98%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/treebuilders/simpletree.rb (98%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/treewalkers.rb (66%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/treewalkers/base.rb (98%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/treewalkers/hpricot.rb (89%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/treewalkers/rexml.rb (89%) rename vendor/plugins/HTML5lib/lib/{html5lib => html5}/treewalkers/simpletree.rb (86%) delete mode 100755 vendor/plugins/HTML5lib/lib/html5lib/constants.rb delete mode 100644 vendor/plugins/HTML5lib/lib/html5lib/filters.rb delete mode 100644 vendor/plugins/HTML5lib/lib/html5lib/serializer.rb diff --git a/lib/sanitize.rb b/lib/sanitize.rb index c36e7583..c0773fd4 100644 --- a/lib/sanitize.rb +++ b/lib/sanitize.rb @@ -25,14 +25,14 @@ module Sanitize - require 'html5lib/html5parser' - require 'html5lib/liberalxmlparser' - require 'html5lib/treewalkers' - require 'html5lib/treebuilders' - require 'html5lib/serializer' - require 'html5lib/sanitizer' + require 'html5/html5parser' + require 'html5/liberalxmlparser' + require 'html5/treewalkers' + require 'html5/treebuilders' + require 'html5/serializer' + require 'html5/sanitizer' - include HTML5lib + include HTML5 # Sanitize a string, parsed using XHTML parsing rules. # diff --git a/vendor/plugins/HTML5lib/lib/html5lib.rb b/vendor/plugins/HTML5lib/lib/html5.rb similarity index 76% rename from vendor/plugins/HTML5lib/lib/html5lib.rb rename to vendor/plugins/HTML5lib/lib/html5.rb index b4aba9a9..bd2174f2 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib.rb +++ b/vendor/plugins/HTML5lib/lib/html5.rb @@ -1,11 +1,11 @@ -require 'html5lib/html5parser' - -module HTML5lib - def self.parse(stream, options={}) - HTMLParser.parse(stream, options) - end - - def self.parseFragment(stream, options={}) - HTMLParser.parse(stream, options) - end -end +require 'html5/html5parser' + +module HTML5 + def self.parse(stream, options={}) + HTMLParser.parse(stream, options) + end + + def self.parseFragment(stream, options={}) + HTMLParser.parse(stream, options) + end +end diff --git a/vendor/plugins/HTML5lib/lib/html5/constants.rb b/vendor/plugins/HTML5lib/lib/html5/constants.rb new file mode 100755 index 00000000..356fb836 --- /dev/null +++ b/vendor/plugins/HTML5lib/lib/html5/constants.rb @@ -0,0 +1,817 @@ +module HTML5 + + class EOF < Exception; end + + CONTENT_MODEL_FLAGS = [ + :PCDATA, + :RCDATA, + :CDATA, + :PLAINTEXT + ] + + SCOPING_ELEMENTS = %w[ + button + caption + html + marquee + object + table + td + th + ] + + FORMATTING_ELEMENTS = %w[ + a + b + big + em + font + i + nobr + s + small + strike + strong + tt + u + ] + + SPECIAL_ELEMENTS = %w[ + address + area + base + basefont + bgsound + blockquote + body + br + center + col + colgroup + dd + dir + div + dl + dt + embed + fieldset + form + frame + frameset + h1 + h2 + h3 + h4 + h5 + h6 + head + hr + iframe + image + img + input + isindex + li + link + listing + menu + meta + noembed + noframes + noscript + ol + optgroup + option + p + param + plaintext + pre + script + select + spacer + style + tbody + textarea + tfoot + thead + title + tr + ul + wbr + ] + + SPACE_CHARACTERS = %W[ + \t + \n + \x0B + \x0C + \x20 + \r + ] + + TABLE_INSERT_MODE_ELEMENTS = %w[ + table + tbody + tfoot + thead + tr + ] + + ASCII_LOWERCASE = ('a'..'z').to_a.join('') + ASCII_UPPERCASE = ('A'..'Z').to_a.join('') + ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE + DIGITS = '0'..'9' + HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a + + # Heading elements need to be ordered + HEADING_ELEMENTS = %w[ + h1 + h2 + h3 + h4 + h5 + h6 + ] + + # XXX What about event-source and command? + VOID_ELEMENTS = %w[ + base + link + meta + hr + br + img + embed + param + area + col + input + ] + + CDATA_ELEMENTS = %w[title textarea] + + RCDATA_ELEMENTS = %w[ + style + script + xmp + iframe + noembed + noframes + noscript + ] + + BOOLEAN_ATTRIBUTES = { + :global => %w[irrelevant], + 'style' => %w[scoped], + 'img' => %w[ismap], + 'audio' => %w[autoplay controls], + 'video' => %w[autoplay controls], + 'script' => %w[defer async], + 'details' => %w[open], + 'datagrid' => %w[multiple disabled], + 'command' => %w[hidden disabled checked default], + 'menu' => %w[autosubmit], + 'fieldset' => %w[disabled readonly], + 'option' => %w[disabled readonly selected], + 'optgroup' => %w[disabled readonly], + 'button' => %w[disabled autofocus], + 'input' => %w[disabled readonly required autofocus checked ismap], + 'select' => %w[disabled readonly autofocus multiple], + 'output' => %w[disabled readonly] + } + + # entitiesWindows1252 has to be _ordered_ and needs to have an index. + ENTITIES_WINDOWS1252 = [ + 8364, # 0x80 0x20AC EURO SIGN + 65533, # 0x81 UNDEFINED + 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK + 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK + 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK + 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS + 8224, # 0x86 0x2020 DAGGER + 8225, # 0x87 0x2021 DOUBLE DAGGER + 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT + 8240, # 0x89 0x2030 PER MILLE SIGN + 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON + 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK + 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE + 65533, # 0x8D UNDEFINED + 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON + 65533, # 0x8F UNDEFINED + 65533, # 0x90 UNDEFINED + 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK + 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK + 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK + 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK + 8226, # 0x95 0x2022 BULLET + 8211, # 0x96 0x2013 EN DASH + 8212, # 0x97 0x2014 EM DASH + 732, # 0x98 0x02DC SMALL TILDE + 8482, # 0x99 0x2122 TRADE MARK SIGN + 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON + 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE + 65533, # 0x9D UNDEFINED + 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON + 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS + ] + + # ENTITIES was generated from Python using the following code: + # + # import constants + # entities = constants.entities.items() + # entities.sort() + # list = [ ' '.join([repr(entity), '=>', ord(value)<128 and + # repr(str(value)) or repr(value.encode('utf-8')).replace("'",'"')]) + # for entity, value in entities] + # print ' ENTITIES = {\n ' + ',\n '.join(list) + '\n }' + + ENTITIES = { + 'AElig' => "\xc3\x86", + 'AElig;' => "\xc3\x86", + 'AMP' => '&', + 'AMP;' => '&', + 'Aacute' => "\xc3\x81", + 'Aacute;' => "\xc3\x81", + 'Acirc' => "\xc3\x82", + 'Acirc;' => "\xc3\x82", + 'Agrave' => "\xc3\x80", + 'Agrave;' => "\xc3\x80", + 'Alpha;' => "\xce\x91", + 'Aring' => "\xc3\x85", + 'Aring;' => "\xc3\x85", + 'Atilde' => "\xc3\x83", + 'Atilde;' => "\xc3\x83", + 'Auml' => "\xc3\x84", + 'Auml;' => "\xc3\x84", + 'Beta;' => "\xce\x92", + 'COPY' => "\xc2\xa9", + 'COPY;' => "\xc2\xa9", + 'Ccedil' => "\xc3\x87", + 'Ccedil;' => "\xc3\x87", + 'Chi;' => "\xce\xa7", + 'Dagger;' => "\xe2\x80\xa1", + 'Delta;' => "\xce\x94", + 'ETH' => "\xc3\x90", + 'ETH;' => "\xc3\x90", + 'Eacute' => "\xc3\x89", + 'Eacute;' => "\xc3\x89", + 'Ecirc' => "\xc3\x8a", + 'Ecirc;' => "\xc3\x8a", + 'Egrave' => "\xc3\x88", + 'Egrave;' => "\xc3\x88", + 'Epsilon;' => "\xce\x95", + 'Eta;' => "\xce\x97", + 'Euml' => "\xc3\x8b", + 'Euml;' => "\xc3\x8b", + 'GT' => '>', + 'GT;' => '>', + 'Gamma;' => "\xce\x93", + 'Iacute' => "\xc3\x8d", + 'Iacute;' => "\xc3\x8d", + 'Icirc' => "\xc3\x8e", + 'Icirc;' => "\xc3\x8e", + 'Igrave' => "\xc3\x8c", + 'Igrave;' => "\xc3\x8c", + 'Iota;' => "\xce\x99", + 'Iuml' => "\xc3\x8f", + 'Iuml;' => "\xc3\x8f", + 'Kappa;' => "\xce\x9a", + 'LT' => '<', + 'LT;' => '<', + 'Lambda;' => "\xce\x9b", + 'Mu;' => "\xce\x9c", + 'Ntilde' => "\xc3\x91", + 'Ntilde;' => "\xc3\x91", + 'Nu;' => "\xce\x9d", + 'OElig;' => "\xc5\x92", + 'Oacute' => "\xc3\x93", + 'Oacute;' => "\xc3\x93", + 'Ocirc' => "\xc3\x94", + 'Ocirc;' => "\xc3\x94", + 'Ograve' => "\xc3\x92", + 'Ograve;' => "\xc3\x92", + 'Omega;' => "\xce\xa9", + 'Omicron;' => "\xce\x9f", + 'Oslash' => "\xc3\x98", + 'Oslash;' => "\xc3\x98", + 'Otilde' => "\xc3\x95", + 'Otilde;' => "\xc3\x95", + 'Ouml' => "\xc3\x96", + 'Ouml;' => "\xc3\x96", + 'Phi;' => "\xce\xa6", + 'Pi;' => "\xce\xa0", + 'Prime;' => "\xe2\x80\xb3", + 'Psi;' => "\xce\xa8", + 'QUOT' => '"', + 'QUOT;' => '"', + 'REG' => "\xc2\xae", + 'REG;' => "\xc2\xae", + 'Rho;' => "\xce\xa1", + 'Scaron;' => "\xc5\xa0", + 'Sigma;' => "\xce\xa3", + 'THORN' => "\xc3\x9e", + 'THORN;' => "\xc3\x9e", + 'TRADE;' => "\xe2\x84\xa2", + 'Tau;' => "\xce\xa4", + 'Theta;' => "\xce\x98", + 'Uacute' => "\xc3\x9a", + 'Uacute;' => "\xc3\x9a", + 'Ucirc' => "\xc3\x9b", + 'Ucirc;' => "\xc3\x9b", + 'Ugrave' => "\xc3\x99", + 'Ugrave;' => "\xc3\x99", + 'Upsilon;' => "\xce\xa5", + 'Uuml' => "\xc3\x9c", + 'Uuml;' => "\xc3\x9c", + 'Xi;' => "\xce\x9e", + 'Yacute' => "\xc3\x9d", + 'Yacute;' => "\xc3\x9d", + 'Yuml;' => "\xc5\xb8", + 'Zeta;' => "\xce\x96", + 'aacute' => "\xc3\xa1", + 'aacute;' => "\xc3\xa1", + 'acirc' => "\xc3\xa2", + 'acirc;' => "\xc3\xa2", + 'acute' => "\xc2\xb4", + 'acute;' => "\xc2\xb4", + 'aelig' => "\xc3\xa6", + 'aelig;' => "\xc3\xa6", + 'agrave' => "\xc3\xa0", + 'agrave;' => "\xc3\xa0", + 'alefsym;' => "\xe2\x84\xb5", + 'alpha;' => "\xce\xb1", + 'amp' => '&', + 'amp;' => '&', + 'and;' => "\xe2\x88\xa7", + 'ang;' => "\xe2\x88\xa0", + 'apos;' => "'", + 'aring' => "\xc3\xa5", + 'aring;' => "\xc3\xa5", + 'asymp;' => "\xe2\x89\x88", + 'atilde' => "\xc3\xa3", + 'atilde;' => "\xc3\xa3", + 'auml' => "\xc3\xa4", + 'auml;' => "\xc3\xa4", + 'bdquo;' => "\xe2\x80\x9e", + 'beta;' => "\xce\xb2", + 'brvbar' => "\xc2\xa6", + 'brvbar;' => "\xc2\xa6", + 'bull;' => "\xe2\x80\xa2", + 'cap;' => "\xe2\x88\xa9", + 'ccedil' => "\xc3\xa7", + 'ccedil;' => "\xc3\xa7", + 'cedil' => "\xc2\xb8", + 'cedil;' => "\xc2\xb8", + 'cent' => "\xc2\xa2", + 'cent;' => "\xc2\xa2", + 'chi;' => "\xcf\x87", + 'circ;' => "\xcb\x86", + 'clubs;' => "\xe2\x99\xa3", + 'cong;' => "\xe2\x89\x85", + 'copy' => "\xc2\xa9", + 'copy;' => "\xc2\xa9", + 'crarr;' => "\xe2\x86\xb5", + 'cup;' => "\xe2\x88\xaa", + 'curren' => "\xc2\xa4", + 'curren;' => "\xc2\xa4", + 'dArr;' => "\xe2\x87\x93", + 'dagger;' => "\xe2\x80\xa0", + 'darr;' => "\xe2\x86\x93", + 'deg' => "\xc2\xb0", + 'deg;' => "\xc2\xb0", + 'delta;' => "\xce\xb4", + 'diams;' => "\xe2\x99\xa6", + 'divide' => "\xc3\xb7", + 'divide;' => "\xc3\xb7", + 'eacute' => "\xc3\xa9", + 'eacute;' => "\xc3\xa9", + 'ecirc' => "\xc3\xaa", + 'ecirc;' => "\xc3\xaa", + 'egrave' => "\xc3\xa8", + 'egrave;' => "\xc3\xa8", + 'empty;' => "\xe2\x88\x85", + 'emsp;' => "\xe2\x80\x83", + 'ensp;' => "\xe2\x80\x82", + 'epsilon;' => "\xce\xb5", + 'equiv;' => "\xe2\x89\xa1", + 'eta;' => "\xce\xb7", + 'eth' => "\xc3\xb0", + 'eth;' => "\xc3\xb0", + 'euml' => "\xc3\xab", + 'euml;' => "\xc3\xab", + 'euro;' => "\xe2\x82\xac", + 'exist;' => "\xe2\x88\x83", + 'fnof;' => "\xc6\x92", + 'forall;' => "\xe2\x88\x80", + 'frac12' => "\xc2\xbd", + 'frac12;' => "\xc2\xbd", + 'frac14' => "\xc2\xbc", + 'frac14;' => "\xc2\xbc", + 'frac34' => "\xc2\xbe", + 'frac34;' => "\xc2\xbe", + 'frasl;' => "\xe2\x81\x84", + 'gamma;' => "\xce\xb3", + 'ge;' => "\xe2\x89\xa5", + 'gt' => '>', + 'gt;' => '>', + 'hArr;' => "\xe2\x87\x94", + 'harr;' => "\xe2\x86\x94", + 'hearts;' => "\xe2\x99\xa5", + 'hellip;' => "\xe2\x80\xa6", + 'iacute' => "\xc3\xad", + 'iacute;' => "\xc3\xad", + 'icirc' => "\xc3\xae", + 'icirc;' => "\xc3\xae", + 'iexcl' => "\xc2\xa1", + 'iexcl;' => "\xc2\xa1", + 'igrave' => "\xc3\xac", + 'igrave;' => "\xc3\xac", + 'image;' => "\xe2\x84\x91", + 'infin;' => "\xe2\x88\x9e", + 'int;' => "\xe2\x88\xab", + 'iota;' => "\xce\xb9", + 'iquest' => "\xc2\xbf", + 'iquest;' => "\xc2\xbf", + 'isin;' => "\xe2\x88\x88", + 'iuml' => "\xc3\xaf", + 'iuml;' => "\xc3\xaf", + 'kappa;' => "\xce\xba", + 'lArr;' => "\xe2\x87\x90", + 'lambda;' => "\xce\xbb", + 'lang;' => "\xe3\x80\x88", + 'laquo' => "\xc2\xab", + 'laquo;' => "\xc2\xab", + 'larr;' => "\xe2\x86\x90", + 'lceil;' => "\xe2\x8c\x88", + 'ldquo;' => "\xe2\x80\x9c", + 'le;' => "\xe2\x89\xa4", + 'lfloor;' => "\xe2\x8c\x8a", + 'lowast;' => "\xe2\x88\x97", + 'loz;' => "\xe2\x97\x8a", + 'lrm;' => "\xe2\x80\x8e", + 'lsaquo;' => "\xe2\x80\xb9", + 'lsquo;' => "\xe2\x80\x98", + 'lt' => '<', + 'lt;' => '<', + 'macr' => "\xc2\xaf", + 'macr;' => "\xc2\xaf", + 'mdash;' => "\xe2\x80\x94", + 'micro' => "\xc2\xb5", + 'micro;' => "\xc2\xb5", + 'middot' => "\xc2\xb7", + 'middot;' => "\xc2\xb7", + 'minus;' => "\xe2\x88\x92", + 'mu;' => "\xce\xbc", + 'nabla;' => "\xe2\x88\x87", + 'nbsp' => "\xc2\xa0", + 'nbsp;' => "\xc2\xa0", + 'ndash;' => "\xe2\x80\x93", + 'ne;' => "\xe2\x89\xa0", + 'ni;' => "\xe2\x88\x8b", + 'not' => "\xc2\xac", + 'not;' => "\xc2\xac", + 'notin;' => "\xe2\x88\x89", + 'nsub;' => "\xe2\x8a\x84", + 'ntilde' => "\xc3\xb1", + 'ntilde;' => "\xc3\xb1", + 'nu;' => "\xce\xbd", + 'oacute' => "\xc3\xb3", + 'oacute;' => "\xc3\xb3", + 'ocirc' => "\xc3\xb4", + 'ocirc;' => "\xc3\xb4", + 'oelig;' => "\xc5\x93", + 'ograve' => "\xc3\xb2", + 'ograve;' => "\xc3\xb2", + 'oline;' => "\xe2\x80\xbe", + 'omega;' => "\xcf\x89", + 'omicron;' => "\xce\xbf", + 'oplus;' => "\xe2\x8a\x95", + 'or;' => "\xe2\x88\xa8", + 'ordf' => "\xc2\xaa", + 'ordf;' => "\xc2\xaa", + 'ordm' => "\xc2\xba", + 'ordm;' => "\xc2\xba", + 'oslash' => "\xc3\xb8", + 'oslash;' => "\xc3\xb8", + 'otilde' => "\xc3\xb5", + 'otilde;' => "\xc3\xb5", + 'otimes;' => "\xe2\x8a\x97", + 'ouml' => "\xc3\xb6", + 'ouml;' => "\xc3\xb6", + 'para' => "\xc2\xb6", + 'para;' => "\xc2\xb6", + 'part;' => "\xe2\x88\x82", + 'permil;' => "\xe2\x80\xb0", + 'perp;' => "\xe2\x8a\xa5", + 'phi;' => "\xcf\x86", + 'pi;' => "\xcf\x80", + 'piv;' => "\xcf\x96", + 'plusmn' => "\xc2\xb1", + 'plusmn;' => "\xc2\xb1", + 'pound' => "\xc2\xa3", + 'pound;' => "\xc2\xa3", + 'prime;' => "\xe2\x80\xb2", + 'prod;' => "\xe2\x88\x8f", + 'prop;' => "\xe2\x88\x9d", + 'psi;' => "\xcf\x88", + 'quot' => '"', + 'quot;' => '"', + 'rArr;' => "\xe2\x87\x92", + 'radic;' => "\xe2\x88\x9a", + 'rang;' => "\xe3\x80\x89", + 'raquo' => "\xc2\xbb", + 'raquo;' => "\xc2\xbb", + 'rarr;' => "\xe2\x86\x92", + 'rceil;' => "\xe2\x8c\x89", + 'rdquo;' => "\xe2\x80\x9d", + 'real;' => "\xe2\x84\x9c", + 'reg' => "\xc2\xae", + 'reg;' => "\xc2\xae", + 'rfloor;' => "\xe2\x8c\x8b", + 'rho;' => "\xcf\x81", + 'rlm;' => "\xe2\x80\x8f", + 'rsaquo;' => "\xe2\x80\xba", + 'rsquo;' => "\xe2\x80\x99", + 'sbquo;' => "\xe2\x80\x9a", + 'scaron;' => "\xc5\xa1", + 'sdot;' => "\xe2\x8b\x85", + 'sect' => "\xc2\xa7", + 'sect;' => "\xc2\xa7", + 'shy' => "\xc2\xad", + 'shy;' => "\xc2\xad", + 'sigma;' => "\xcf\x83", + 'sigmaf;' => "\xcf\x82", + 'sim;' => "\xe2\x88\xbc", + 'spades;' => "\xe2\x99\xa0", + 'sub;' => "\xe2\x8a\x82", + 'sube;' => "\xe2\x8a\x86", + 'sum;' => "\xe2\x88\x91", + 'sup1' => "\xc2\xb9", + 'sup1;' => "\xc2\xb9", + 'sup2' => "\xc2\xb2", + 'sup2;' => "\xc2\xb2", + 'sup3' => "\xc2\xb3", + 'sup3;' => "\xc2\xb3", + 'sup;' => "\xe2\x8a\x83", + 'supe;' => "\xe2\x8a\x87", + 'szlig' => "\xc3\x9f", + 'szlig;' => "\xc3\x9f", + 'tau;' => "\xcf\x84", + 'there4;' => "\xe2\x88\xb4", + 'theta;' => "\xce\xb8", + 'thetasym;' => "\xcf\x91", + 'thinsp;' => "\xe2\x80\x89", + 'thorn' => "\xc3\xbe", + 'thorn;' => "\xc3\xbe", + 'tilde;' => "\xcb\x9c", + 'times' => "\xc3\x97", + 'times;' => "\xc3\x97", + 'trade;' => "\xe2\x84\xa2", + 'uArr;' => "\xe2\x87\x91", + 'uacute' => "\xc3\xba", + 'uacute;' => "\xc3\xba", + 'uarr;' => "\xe2\x86\x91", + 'ucirc' => "\xc3\xbb", + 'ucirc;' => "\xc3\xbb", + 'ugrave' => "\xc3\xb9", + 'ugrave;' => "\xc3\xb9", + 'uml' => "\xc2\xa8", + 'uml;' => "\xc2\xa8", + 'upsih;' => "\xcf\x92", + 'upsilon;' => "\xcf\x85", + 'uuml' => "\xc3\xbc", + 'uuml;' => "\xc3\xbc", + 'weierp;' => "\xe2\x84\x98", + 'xi;' => "\xce\xbe", + 'yacute' => "\xc3\xbd", + 'yacute;' => "\xc3\xbd", + 'yen' => "\xc2\xa5", + 'yen;' => "\xc2\xa5", + 'yuml' => "\xc3\xbf", + 'yuml;' => "\xc3\xbf", + 'zeta;' => "\xce\xb6", + 'zwj;' => "\xe2\x80\x8d", + 'zwnj;' => "\xe2\x80\x8c" + } + + ENCODINGS = %w[ + ansi_x3.4-1968 + iso-ir-6 + ansi_x3.4-1986 + iso_646.irv:1991 + ascii + iso646-us + us-ascii + us + ibm367 + cp367 + csascii + ks_c_5601-1987 + korean + iso-2022-kr + csiso2022kr + euc-kr + iso-2022-jp + csiso2022jp + iso-2022-jp-2 + iso-ir-58 + chinese + csiso58gb231280 + iso_8859-1:1987 + iso-ir-100 + iso_8859-1 + iso-8859-1 + latin1 + l1 + ibm819 + cp819 + csisolatin1 + iso_8859-2:1987 + iso-ir-101 + iso_8859-2 + iso-8859-2 + latin2 + l2 + csisolatin2 + iso_8859-3:1988 + iso-ir-109 + iso_8859-3 + iso-8859-3 + latin3 + l3 + csisolatin3 + iso_8859-4:1988 + iso-ir-110 + iso_8859-4 + iso-8859-4 + latin4 + l4 + csisolatin4 + iso_8859-6:1987 + iso-ir-127 + iso_8859-6 + iso-8859-6 + ecma-114 + asmo-708 + arabic + csisolatinarabic + iso_8859-7:1987 + iso-ir-126 + iso_8859-7 + iso-8859-7 + elot_928 + ecma-118 + greek + greek8 + csisolatingreek + iso_8859-8:1988 + iso-ir-138 + iso_8859-8 + iso-8859-8 + hebrew + csisolatinhebrew + iso_8859-5:1988 + iso-ir-144 + iso_8859-5 + iso-8859-5 + cyrillic + csisolatincyrillic + iso_8859-9:1989 + iso-ir-148 + iso_8859-9 + iso-8859-9 + latin5 + l5 + csisolatin5 + iso-8859-10 + iso-ir-157 + l6 + iso_8859-10:1992 + csisolatin6 + latin6 + hp-roman8 + roman8 + r8 + ibm037 + cp037 + csibm037 + ibm424 + cp424 + csibm424 + ibm437 + cp437 + 437 + cspc8codepage437 + ibm500 + cp500 + csibm500 + ibm775 + cp775 + cspc775baltic + ibm850 + cp850 + 850 + cspc850multilingual + ibm852 + cp852 + 852 + cspcp852 + ibm855 + cp855 + 855 + csibm855 + ibm857 + cp857 + 857 + csibm857 + ibm860 + cp860 + 860 + csibm860 + ibm861 + cp861 + 861 + cp-is + csibm861 + ibm862 + cp862 + 862 + cspc862latinhebrew + ibm863 + cp863 + 863 + csibm863 + ibm864 + cp864 + csibm864 + ibm865 + cp865 + 865 + csibm865 + ibm866 + cp866 + 866 + csibm866 + ibm869 + cp869 + 869 + cp-gr + csibm869 + ibm1026 + cp1026 + csibm1026 + koi8-r + cskoi8r + koi8-u + big5-hkscs + ptcp154 + csptcp154 + pt154 + cp154 + utf-7 + utf-16be + utf-16le + utf-16 + utf-8 + iso-8859-13 + iso-8859-14 + iso-ir-199 + iso_8859-14:1998 + iso_8859-14 + latin8 + iso-celtic + l8 + iso-8859-15 + iso_8859-15 + iso-8859-16 + iso-ir-226 + iso_8859-16:2001 + iso_8859-16 + latin10 + l10 + gbk + cp936 + ms936 + gb18030 + shift_jis + ms_kanji + csshiftjis + euc-jp + gb2312 + big5 + csbig5 + windows-1250 + windows-1251 + windows-1252 + windows-1253 + windows-1254 + windows-1255 + windows-1256 + windows-1257 + windows-1258 + tis-620 + hz-gb-2312 + ] + +end diff --git a/vendor/plugins/HTML5lib/lib/html5/filters.rb b/vendor/plugins/HTML5lib/lib/html5/filters.rb new file mode 100644 index 00000000..74c7f0e0 --- /dev/null +++ b/vendor/plugins/HTML5lib/lib/html5/filters.rb @@ -0,0 +1 @@ +require 'html5/filters/optionaltags' diff --git a/vendor/plugins/HTML5lib/lib/html5lib/filters/base.rb b/vendor/plugins/HTML5lib/lib/html5/filters/base.rb similarity index 89% rename from vendor/plugins/HTML5lib/lib/html5lib/filters/base.rb rename to vendor/plugins/HTML5lib/lib/html5/filters/base.rb index c1a5c660..0cb023d2 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/filters/base.rb +++ b/vendor/plugins/HTML5lib/lib/html5/filters/base.rb @@ -1,7 +1,7 @@ require 'delegate' require 'enumerator' -module HTML5lib +module HTML5 module Filters class Base < SimpleDelegator include Enumerable diff --git a/vendor/plugins/HTML5lib/lib/html5lib/filters/inject_meta_charset.rb b/vendor/plugins/HTML5lib/lib/html5/filters/inject_meta_charset.rb similarity index 98% rename from vendor/plugins/HTML5lib/lib/html5lib/filters/inject_meta_charset.rb rename to vendor/plugins/HTML5lib/lib/html5/filters/inject_meta_charset.rb index 00dc980d..b2cf93a5 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/filters/inject_meta_charset.rb +++ b/vendor/plugins/HTML5lib/lib/html5/filters/inject_meta_charset.rb @@ -1,6 +1,6 @@ -require 'html5lib/filters/base' +require 'html5/filters/base' -module HTML5lib +module HTML5 module Filters class InjectMetaCharset < Base def initialize(source, encoding) diff --git a/vendor/plugins/HTML5lib/lib/html5lib/filters/optionaltags.rb b/vendor/plugins/HTML5lib/lib/html5/filters/optionaltags.rb similarity index 99% rename from vendor/plugins/HTML5lib/lib/html5lib/filters/optionaltags.rb rename to vendor/plugins/HTML5lib/lib/html5/filters/optionaltags.rb index aacf3b73..37d2e29b 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/filters/optionaltags.rb +++ b/vendor/plugins/HTML5lib/lib/html5/filters/optionaltags.rb @@ -1,7 +1,7 @@ -require 'html5lib/constants' -require 'html5lib/filters/base' +require 'html5/constants' +require 'html5/filters/base' -module HTML5lib +module HTML5 module Filters class OptionalTagFilter < Base diff --git a/vendor/plugins/HTML5lib/lib/html5lib/filters/sanitizer.rb b/vendor/plugins/HTML5lib/lib/html5/filters/sanitizer.rb similarity index 73% rename from vendor/plugins/HTML5lib/lib/html5lib/filters/sanitizer.rb rename to vendor/plugins/HTML5lib/lib/html5/filters/sanitizer.rb index db9a12e0..8e25f594 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/filters/sanitizer.rb +++ b/vendor/plugins/HTML5lib/lib/html5/filters/sanitizer.rb @@ -1,7 +1,7 @@ -require 'html5lib/filters/base' -require 'html5lib/sanitizer' +require 'html5/filters/base' +require 'html5/sanitizer' -module HTML5lib +module HTML5 module Filters class HTMLSanitizeFilter < Base include HTMLSanitizeModule diff --git a/vendor/plugins/HTML5lib/lib/html5lib/filters/whitespace.rb b/vendor/plugins/HTML5lib/lib/html5/filters/whitespace.rb similarity index 90% rename from vendor/plugins/HTML5lib/lib/html5lib/filters/whitespace.rb rename to vendor/plugins/HTML5lib/lib/html5/filters/whitespace.rb index 3b85fd7b..b1d17190 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/filters/whitespace.rb +++ b/vendor/plugins/HTML5lib/lib/html5/filters/whitespace.rb @@ -1,7 +1,7 @@ -require 'html5lib/constants' -require 'html5lib/filters/base' +require 'html5/constants' +require 'html5/filters/base' -module HTML5lib +module HTML5 module Filters class WhitespaceFilter < Base diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser.rb similarity index 93% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser.rb index bf48930a..b755bee1 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser.rb @@ -1,246 +1,246 @@ -require 'html5lib/constants' -require 'html5lib/tokenizer' -require 'html5lib/treebuilders/rexml' - -Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path| - require 'html5lib/html5parser/' + File.basename(path) -end - -module HTML5lib - - # Error in parsed document - class ParseError < Exception; end - class AssertionError < Exception; end - - # HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML - # - class HTMLParser - - attr_accessor :phase, :firstStartTag, :innerHTML, :lastPhase, :insertFromTable - - attr_reader :phases, :tokenizer, :tree, :errors - - def self.parse(stream, options = {}) - encoding = options.delete(:encoding) - new(options).parse(stream,encoding) - end - - def self.parseFragment(stream, options = {}) - container = options.delete(:container) || 'div' - encoding = options.delete(:encoding) - new(options).parseFragment(stream,container,encoding) - end - - @@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption - inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd ) - - # :strict - raise an exception when a parse error is encountered - # :tree - a treebuilder class controlling the type of tree that will be - # returned. Built in treebuilders can be accessed through - # HTML5lib::TreeBuilders[treeType] - def initialize(options = {}) - @strict = false - @errors = [] - - @tokenizer = HTMLTokenizer - @tree = TreeBuilders::REXML::TreeBuilder - - options.each { |name, value| instance_variable_set("@#{name}", value) } - - @tree = @tree.new - - @phases = @@phases.inject({}) do |phases, phase_name| - phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase' - phases[phase_name.to_sym] = HTML5lib.const_get(phase_class_name).new(self, @tree) - phases - end - end - - def _parse(stream, innerHTML, encoding, container = 'div') - @tree.reset - @firstStartTag = false - @errors = [] - - @tokenizer = @tokenizer.class unless Class === @tokenizer - @tokenizer = @tokenizer.new(stream, :encoding => encoding, - :parseMeta => !innerHTML) - - if innerHTML - case @innerHTML = container.downcase - when 'title', 'textarea' - @tokenizer.contentModelFlag = :RCDATA - when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript' - @tokenizer.contentModelFlag = :CDATA - when 'plaintext' - @tokenizer.contentModelFlag = :PLAINTEXT - else - # contentModelFlag already is PCDATA - #@tokenizer.contentModelFlag = :PCDATA - end - - @phase = @phases[:rootElement] - @phase.insertHtmlElement - resetInsertionMode - else - @innerHTML = false - @phase = @phases[:initial] - end - - # We only seem to have InBodyPhase testcases where the following is - # relevant ... need others too - @lastPhase = nil - - # XXX This is temporary for the moment so there isn't any other - # changes needed for the parser to work with the iterable tokenizer - @tokenizer.each do |token| - token = normalizeToken(token) - - method = 'process%s' % token[:type] - - case token[:type] - when :Characters, :SpaceCharacters, :Comment - @phase.send method, token[:data] - when :StartTag - @phase.send method, token[:name], token[:data] - when :EndTag - @phase.send method, token[:name] - when :Doctype - @phase.send method, token[:name], token[:publicId], - token[:systemId], token[:correct] - else - parseError(token[:data]) - end - end - - # When the loop finishes it's EOF - @phase.processEOF - end - - # Parse a HTML document into a well-formed tree - # - # stream - a filelike object or string containing the HTML to be parsed - # - # The optional encoding parameter must be a string that indicates - # the encoding. If specified, that encoding will be used, - # regardless of any BOM or later declaration (such as in a meta - # element) - def parse(stream, encoding=nil) - _parse(stream, false, encoding) - return @tree.getDocument - end - - # Parse a HTML fragment into a well-formed tree fragment - - # container - name of the element we're setting the innerHTML property - # if set to nil, default to 'div' - # - # stream - a filelike object or string containing the HTML to be parsed - # - # The optional encoding parameter must be a string that indicates - # the encoding. If specified, that encoding will be used, - # regardless of any BOM or later declaration (such as in a meta - # element) - def parseFragment(stream, container='div', encoding=nil) - _parse(stream, true, encoding, container) - return @tree.getFragment - end - - def parseError(data = 'XXX ERROR MESSAGE NEEDED') - # XXX The idea is to make data mandatory. - @errors.push([@tokenizer.stream.position, data]) - raise ParseError if @strict - end - - # HTML5 specific normalizations to the token stream - def normalizeToken(token) - - if token[:type] == :EmptyTag - # When a solidus (/) is encountered within a tag name what happens - # depends on whether the current tag name matches that of a void - # element. If it matches a void element atheists did the wrong - # thing and if it doesn't it's wrong for everyone. - - unless VOID_ELEMENTS.include?(token[:name]) - parseError(_('Solidus (/) incorrectly placed in tag.')) - end - - token[:type] = :StartTag - end - - if token[:type] == :StartTag - token[:name] = token[:name].tr(ASCII_UPPERCASE,ASCII_LOWERCASE) - - # We need to remove the duplicate attributes and convert attributes - # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"} - - unless token[:data].empty? - data = token[:data].reverse.map { |attr, value| [attr.tr(ASCII_UPPERCASE, ASCII_LOWERCASE), value] } - token[:data] = Hash[*data.flatten] - end - - elsif token[:type] == :EndTag - parseError(_('End tag contains unexpected attributes.')) unless token[:data].empty? - token[:name] = token[:name].downcase - end - - return token - end - - @@new_modes = { - 'select' => :inSelect, - 'td' => :inCell, - 'th' => :inCell, - 'tr' => :inRow, - 'tbody' => :inTableBody, - 'thead' => :inTableBody, - 'tfoot' => :inTableBody, - 'caption' => :inCaption, - 'colgroup' => :inColumnGroup, - 'table' => :inTable, - 'head' => :inBody, - 'body' => :inBody, - 'frameset' => :inFrameset - } - - def resetInsertionMode - # The name of this method is mostly historical. (It's also used in the - # specification.) - last = false - - @tree.openElements.reverse.each do |node| - nodeName = node.name - - if node == @tree.openElements[0] - last = true - unless ['td', 'th'].include?(nodeName) - # XXX - # assert @innerHTML - nodeName = @innerHTML - end - end - - # Check for conditions that should only happen in the innerHTML - # case - if ['select', 'colgroup', 'head', 'frameset'].include?(nodeName) - # XXX - # assert @innerHTML - end - - if @@new_modes.has_key?(nodeName) - @phase = @phases[@@new_modes[nodeName]] - elsif nodeName == 'html' - @phase = @phases[@tree.headPointer.nil?? :beforeHead : :afterHead] - elsif last - @phase = @phases[:inBody] - else - next - end - - break - end - end - - def _(string); string; end - end - -end +require 'html5/constants' +require 'html5/tokenizer' +require 'html5/treebuilders/rexml' + +Dir.glob(File.join(File.dirname(__FILE__), 'html5parser', '*_phase.rb')).each do |path| + require 'html5/html5parser/' + File.basename(path) +end + +module HTML5 + + # Error in parsed document + class ParseError < Exception; end + class AssertionError < Exception; end + + # HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML + # + class HTMLParser + + attr_accessor :phase, :firstStartTag, :innerHTML, :lastPhase, :insertFromTable + + attr_reader :phases, :tokenizer, :tree, :errors + + def self.parse(stream, options = {}) + encoding = options.delete(:encoding) + new(options).parse(stream,encoding) + end + + def self.parseFragment(stream, options = {}) + container = options.delete(:container) || 'div' + encoding = options.delete(:encoding) + new(options).parseFragment(stream,container,encoding) + end + + @@phases = %w( initial rootElement beforeHead inHead afterHead inBody inTable inCaption + inColumnGroup inTableBody inRow inCell inSelect afterBody inFrameset afterFrameset trailingEnd ) + + # :strict - raise an exception when a parse error is encountered + # :tree - a treebuilder class controlling the type of tree that will be + # returned. Built in treebuilders can be accessed through + # HTML5::TreeBuilders[treeType] + def initialize(options = {}) + @strict = false + @errors = [] + + @tokenizer = HTMLTokenizer + @tree = TreeBuilders::REXML::TreeBuilder + + options.each { |name, value| instance_variable_set("@#{name}", value) } + + @tree = @tree.new + + @phases = @@phases.inject({}) do |phases, phase_name| + phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase' + phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree) + phases + end + end + + def _parse(stream, innerHTML, encoding, container = 'div') + @tree.reset + @firstStartTag = false + @errors = [] + + @tokenizer = @tokenizer.class unless Class === @tokenizer + @tokenizer = @tokenizer.new(stream, :encoding => encoding, + :parseMeta => !innerHTML) + + if innerHTML + case @innerHTML = container.downcase + when 'title', 'textarea' + @tokenizer.contentModelFlag = :RCDATA + when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript' + @tokenizer.contentModelFlag = :CDATA + when 'plaintext' + @tokenizer.contentModelFlag = :PLAINTEXT + else + # contentModelFlag already is PCDATA + #@tokenizer.contentModelFlag = :PCDATA + end + + @phase = @phases[:rootElement] + @phase.insertHtmlElement + resetInsertionMode + else + @innerHTML = false + @phase = @phases[:initial] + end + + # We only seem to have InBodyPhase testcases where the following is + # relevant ... need others too + @lastPhase = nil + + # XXX This is temporary for the moment so there isn't any other + # changes needed for the parser to work with the iterable tokenizer + @tokenizer.each do |token| + token = normalizeToken(token) + + method = 'process%s' % token[:type] + + case token[:type] + when :Characters, :SpaceCharacters, :Comment + @phase.send method, token[:data] + when :StartTag + @phase.send method, token[:name], token[:data] + when :EndTag + @phase.send method, token[:name] + when :Doctype + @phase.send method, token[:name], token[:publicId], + token[:systemId], token[:correct] + else + parseError(token[:data]) + end + end + + # When the loop finishes it's EOF + @phase.processEOF + end + + # Parse a HTML document into a well-formed tree + # + # stream - a filelike object or string containing the HTML to be parsed + # + # The optional encoding parameter must be a string that indicates + # the encoding. If specified, that encoding will be used, + # regardless of any BOM or later declaration (such as in a meta + # element) + def parse(stream, encoding=nil) + _parse(stream, false, encoding) + return @tree.getDocument + end + + # Parse a HTML fragment into a well-formed tree fragment + + # container - name of the element we're setting the innerHTML property + # if set to nil, default to 'div' + # + # stream - a filelike object or string containing the HTML to be parsed + # + # The optional encoding parameter must be a string that indicates + # the encoding. If specified, that encoding will be used, + # regardless of any BOM or later declaration (such as in a meta + # element) + def parseFragment(stream, container='div', encoding=nil) + _parse(stream, true, encoding, container) + return @tree.getFragment + end + + def parseError(data = 'XXX ERROR MESSAGE NEEDED') + # XXX The idea is to make data mandatory. + @errors.push([@tokenizer.stream.position, data]) + raise ParseError if @strict + end + + # HTML5 specific normalizations to the token stream + def normalizeToken(token) + + if token[:type] == :EmptyTag + # When a solidus (/) is encountered within a tag name what happens + # depends on whether the current tag name matches that of a void + # element. If it matches a void element atheists did the wrong + # thing and if it doesn't it's wrong for everyone. + + unless VOID_ELEMENTS.include?(token[:name]) + parseError(_('Solidus (/) incorrectly placed in tag.')) + end + + token[:type] = :StartTag + end + + if token[:type] == :StartTag + token[:name] = token[:name].tr(ASCII_UPPERCASE,ASCII_LOWERCASE) + + # We need to remove the duplicate attributes and convert attributes + # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"} + + unless token[:data].empty? + data = token[:data].reverse.map { |attr, value| [attr.tr(ASCII_UPPERCASE, ASCII_LOWERCASE), value] } + token[:data] = Hash[*data.flatten] + end + + elsif token[:type] == :EndTag + parseError(_('End tag contains unexpected attributes.')) unless token[:data].empty? + token[:name] = token[:name].downcase + end + + return token + end + + @@new_modes = { + 'select' => :inSelect, + 'td' => :inCell, + 'th' => :inCell, + 'tr' => :inRow, + 'tbody' => :inTableBody, + 'thead' => :inTableBody, + 'tfoot' => :inTableBody, + 'caption' => :inCaption, + 'colgroup' => :inColumnGroup, + 'table' => :inTable, + 'head' => :inBody, + 'body' => :inBody, + 'frameset' => :inFrameset + } + + def resetInsertionMode + # The name of this method is mostly historical. (It's also used in the + # specification.) + last = false + + @tree.openElements.reverse.each do |node| + nodeName = node.name + + if node == @tree.openElements[0] + last = true + unless ['td', 'th'].include?(nodeName) + # XXX + # assert @innerHTML + nodeName = @innerHTML + end + end + + # Check for conditions that should only happen in the innerHTML + # case + if ['select', 'colgroup', 'head', 'frameset'].include?(nodeName) + # XXX + # assert @innerHTML + end + + if @@new_modes.has_key?(nodeName) + @phase = @phases[@@new_modes[nodeName]] + elsif nodeName == 'html' + @phase = @phases[@tree.headPointer.nil?? :beforeHead : :afterHead] + elsif last + @phase = @phases[:inBody] + else + next + end + + break + end + end + + def _(string); string; end + end + +end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/after_body_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/after_body_phase.rb similarity index 96% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/after_body_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/after_body_phase.rb index 27778ef1..b68a0af2 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/after_body_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/after_body_phase.rb @@ -1,6 +1,6 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class AfterBodyPhase < Phase handle_end 'html' diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/after_frameset_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/after_frameset_phase.rb similarity index 94% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/after_frameset_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/after_frameset_phase.rb index 376c5f38..7c97bf43 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/after_frameset_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/after_frameset_phase.rb @@ -1,6 +1,6 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class AfterFramesetPhase < Phase # http://www.whatwg.org/specs/web-apps/current-work/#after3 diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/after_head_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/after_head_phase.rb similarity index 95% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/after_head_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/after_head_phase.rb index 37c8bf6b..082219e1 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/after_head_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/after_head_phase.rb @@ -1,6 +1,6 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class AfterHeadPhase < Phase handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead' diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/before_head_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/before_head_phase.rb similarity index 88% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/before_head_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/before_head_phase.rb index 98a9d023..6452dd02 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/before_head_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/before_head_phase.rb @@ -1,11 +1,11 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class BeforeHeadPhase < Phase handle_start 'html', 'head' - handle_end %w( html head body br ) => 'ImplyHead' + handle_end %w( html head body br p ) => 'ImplyHead' def processEOF startTagHead('head', {}) diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_body_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_body_phase.rb similarity index 94% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_body_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/in_body_phase.rb index 57720292..306efb05 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_body_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_body_phase.rb @@ -1,6 +1,6 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class InBodyPhase < Phase # http://www.whatwg.org/specs/web-apps/current-work/#in-body @@ -112,7 +112,7 @@ module HTML5lib def startTagForm(name, attributes) if @tree.formPointer - @parser.parseError('Unexpected start tag (form). Ignored.') + @parser.parseError(_('Unexpected start tag (form). Ignored.')) else endTagP('p') if in_scope?('p') @tree.insertElement(name, attributes) @@ -129,9 +129,9 @@ module HTML5lib if stopName.include?(node.name) poppedNodes = (0..i).collect { @tree.openElements.pop } if i >= 1 - @parser.parseError("Missing end tag%s (%s)" % [ + @parser.parseError(_("Missing end tag%s (%s)" % [ (i>1 ? 's' : ''), - poppedNodes.reverse.map {|item| item.name}.join(', ')]) + poppedNodes.reverse.map {|item| item.name}.join(', ')])) end break end @@ -251,7 +251,7 @@ module HTML5lib end def startTagIsindex(name, attributes) - @parser.parseError("Unexpected start tag isindex. Don't use it!") + @parser.parseError(_("Unexpected start tag isindex. Don't use it!")) return if @tree.formPointer processStartTag('form', {}) processStartTag('hr', {}) @@ -311,8 +311,13 @@ module HTML5lib def endTagP(name) @tree.generateImpliedEndTags('p') if in_scope?('p') - @parser.parseError('Unexpected end tag (p).') unless @tree.openElements[-1].name == 'p' - @tree.openElements.pop while in_scope?('p') + @parser.parseError(_('Unexpected end tag (p).')) unless @tree.openElements[-1].name == 'p' + if in_scope?('p') + @tree.openElements.pop while in_scope?('p') + else + startTagCloseP('p', {}) + endTagP('p') + end end def endTagBody(name) @@ -342,7 +347,7 @@ module HTML5lib @tree.generateImpliedEndTags if in_scope?(name) unless @tree.openElements[-1].name == name - @parser.parseError(("End tag (#{name}) seen too early. Expected other end tag.")) + @parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag.")) end if in_scope?(name) @@ -351,7 +356,14 @@ module HTML5lib end def endTagForm(name) - endTagBlock(name) + if in_scope?(name) + @tree.generateImpliedEndTags + end + if @tree.openElements[-1].name != name + @parser.parseError(_("End tag (form) seen too early. Ignored.")) + else + @tree.openElements.pop + end @tree.formPointer = nil end @@ -361,7 +373,7 @@ module HTML5lib @tree.generateImpliedEndTags(name) unless @tree.openElements[-1].name == name - @parser.parseError(("End tag (#{name}) seen too early. Expected other end tag.")) + @parser.parseError(_("End tag (#{name}) seen too early. Expected other end tag.")) end end @@ -377,7 +389,7 @@ module HTML5lib end unless @tree.openElements[-1].name == name - @parser.parseError(("Unexpected end tag (#{name}). Expected other end tag.")) + @parser.parseError(_("Unexpected end tag (#{name}). Expected other end tag.")) end HEADING_ELEMENTS.each do |element| diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_caption_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_caption_phase.rb similarity index 97% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_caption_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/in_caption_phase.rb index ccdfcb91..bbafdcd8 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_caption_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_caption_phase.rb @@ -1,6 +1,6 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class InCaptionPhase < Phase # http://www.whatwg.org/specs/web-apps/current-work/#in-caption diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_cell_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_cell_phase.rb similarity index 97% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_cell_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/in_cell_phase.rb index 5b88a30b..24fdf28e 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_cell_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_cell_phase.rb @@ -1,6 +1,6 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class InCellPhase < Phase # http://www.whatwg.org/specs/web-apps/current-work/#in-cell diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_column_group_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_column_group_phase.rb similarity index 96% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_column_group_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/in_column_group_phase.rb index 7729eb83..e257bb17 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_column_group_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_column_group_phase.rb @@ -1,6 +1,6 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class InColumnGroupPhase < Phase # http://www.whatwg.org/specs/web-apps/current-work/#in-column diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_frameset_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_frameset_phase.rb similarity index 96% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_frameset_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/in_frameset_phase.rb index d6c7400c..0a9b4b29 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_frameset_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_frameset_phase.rb @@ -1,6 +1,6 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class InFramesetPhase < Phase # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_head_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_head_phase.rb similarity index 96% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_head_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/in_head_phase.rb index 20b37653..d16205f1 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_head_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_head_phase.rb @@ -1,12 +1,12 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class InHeadPhase < Phase handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta ) handle_end 'head' - handle_end %w( html body br ) => 'ImplyAfterHead' + handle_end %w( html body br p ) => 'ImplyAfterHead' handle_end %w( title style script ) def processEOF diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_row_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_row_phase.rb similarity index 97% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_row_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/in_row_phase.rb index b3ffa3f0..b8e4640a 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_row_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_row_phase.rb @@ -1,6 +1,6 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class InRowPhase < Phase # http://www.whatwg.org/specs/web-apps/current-work/#in-row diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_select_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_select_phase.rb similarity index 97% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_select_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/in_select_phase.rb index 850b8f9f..8c54996f 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_select_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_select_phase.rb @@ -1,6 +1,6 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class InSelectPhase < Phase # http://www.whatwg.org/specs/web-apps/current-work/#in-select diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_body_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_table_body_phase.rb similarity index 97% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_body_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/in_table_body_phase.rb index 79448216..6e998dab 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_body_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_table_body_phase.rb @@ -1,6 +1,6 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class InTableBodyPhase < Phase # http://www.whatwg.org/specs/web-apps/current-work/#in-table0 diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_table_phase.rb similarity index 98% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/in_table_phase.rb index be38c53e..9adaf2ad 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/in_table_phase.rb @@ -1,6 +1,6 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class InTablePhase < Phase # http://www.whatwg.org/specs/web-apps/current-work/#in-table diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/initial_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/initial_phase.rb similarity index 99% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/initial_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/initial_phase.rb index aeb0afdd..392a69cd 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/initial_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/initial_phase.rb @@ -1,6 +1,6 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class InitialPhase < Phase # This phase deals with error handling as well which is currently not diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/phase.rb similarity index 99% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/phase.rb index d451eb37..b4bd11e1 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/phase.rb @@ -1,4 +1,4 @@ -module HTML5lib +module HTML5 # Base class for helper objects that implement each phase of processing. # # Handler methods should be in the following order (they can be omitted): diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/root_element_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/root_element_phase.rb similarity index 94% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/root_element_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/root_element_phase.rb index 7a4b67c8..437f8812 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/root_element_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/root_element_phase.rb @@ -1,6 +1,6 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class RootElementPhase < Phase def processEOF diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/trailing_end_phase.rb b/vendor/plugins/HTML5lib/lib/html5/html5parser/trailing_end_phase.rb similarity index 94% rename from vendor/plugins/HTML5lib/lib/html5lib/html5parser/trailing_end_phase.rb rename to vendor/plugins/HTML5lib/lib/html5/html5parser/trailing_end_phase.rb index f8f8d33a..74b1dda9 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/trailing_end_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5/html5parser/trailing_end_phase.rb @@ -1,6 +1,6 @@ -require 'html5lib/html5parser/phase' +require 'html5/html5parser/phase' -module HTML5lib +module HTML5 class TrailingEndPhase < Phase def processEOF diff --git a/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb b/vendor/plugins/HTML5lib/lib/html5/inputstream.rb similarity index 90% rename from vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb rename to vendor/plugins/HTML5lib/lib/html5/inputstream.rb index 3abb5b67..94368d00 100755 --- a/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb +++ b/vendor/plugins/HTML5lib/lib/html5/inputstream.rb @@ -1,7 +1,7 @@ require 'stringio' -require 'html5lib/constants' +require 'html5/constants' -module HTML5lib +module HTML5 # Provides a unicode stream of characters to the HTMLTokenizer. @@ -10,7 +10,7 @@ module HTML5lib class HTMLInputStream - attr_accessor :queue, :char_encoding + attr_accessor :queue, :char_encoding, :errors # Initialises the HTMLInputStream. # @@ -40,25 +40,31 @@ module HTML5lib #Number of bytes to use when looking for a meta element with #encoding information @NUM_BYTES_META = 512 + #Number of bytes to use when using detecting encoding using chardet + @NUM_BYTES_CHARDET = 256 + #Number of bytes to use when reading content + @NUM_BYTES_BUFFER = 1024 + #Encoding to use if no other information can be found @DEFAULT_ENCODING = 'windows-1252' #Detect encoding iff no explicit "transport level" encoding is supplied - if @encoding.nil? or not HTML5lib.is_valid_encoding(@encoding) + if @encoding.nil? or not HTML5.is_valid_encoding(@encoding) @char_encoding = detect_encoding else @char_encoding = @encoding end # Read bytes from stream decoding them into Unicode - uString = @raw_stream.read + @buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || '' if @char_encoding == 'windows-1252' @win1252 = true elsif @char_encoding != 'utf-8' begin require 'iconv' begin - uString = Iconv.iconv('utf-8', @char_encoding, uString).first + @buffer << @raw_stream.read unless @raw_stream.eof? + @buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first rescue @win1252 = true end @@ -67,10 +73,8 @@ module HTML5lib end end - # Convert the unicode string into a list to be used as the data stream - @data_stream = uString - @queue = [] + @errors = [] # Reset position in the list to read from @tell = 0 @@ -109,9 +113,22 @@ module HTML5lib begin require 'rubygems' require 'UniversalDetector' # gem install chardet - buffer = @raw_stream.read - encoding = UniversalDetector::chardet(buffer)['encoding'] - seek(buffer, 0) + buffers = [] + detector = UniversalDetector::Detector.instance + detector.reset + until @raw_stream.eof? + buffer = @raw_stream.read(@NUM_BYTES_CHARDET) + break if !buffer or buffer.empty? + buffers << buffer + detector.feed(buffer) + break if detector.instance_eval {@done} + detector.instance_eval { + @_mLastChar = @_mLastChar.chr if Fixnum === @_mLastChar + } + end + detector.close + encoding = detector.result['encoding'] + seek(buffers*'', 0) rescue LoadError end end @@ -242,14 +259,20 @@ module HTML5lib unless @queue.empty? return @queue.shift else - c = @data_stream[@tell] + if @tell + 3 > @buffer.length and !@raw_stream.eof? + # read next block + @buffer = @buffer[@tell .. -1] + @raw_stream.read(@NUM_BYTES_BUFFER) + @tell = 0 + end + + c = @buffer[@tell] @tell += 1 case c when 0x01 .. 0x7F if c == 0x0D # normalize newlines - @tell += 1 if @data_stream[@tell] == 0x0A + @tell += 1 if @buffer[@tell] == 0x0A c = 0x0A end @@ -276,7 +299,7 @@ module HTML5lib when 0xC0 .. 0xFF if @win1252 "\xC3" + (c-64).chr # convert to utf-8 - elsif @data_stream[@tell-1 .. -1] =~ /^ + elsif @buffer[@tell-1 .. @tell+3] =~ /^ ( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte @@ -292,6 +315,8 @@ module HTML5lib end when 0x00 + @errors.push('null character found in input stream, ' + + 'replaced with U+FFFD') [0xFFFD].pack('U') # null characters are invalid else @@ -317,6 +342,10 @@ module HTML5lib @queue.insert(0, c) unless c == :EOF return char_stack.join('') end + + def unget(characters) + @queue.unshift(*characters.to_a) unless characters == :EOF + end end # String-like object with an assosiated position and various extra methods @@ -433,14 +462,14 @@ module HTML5lib if attr[0] == 'charset' tentative_encoding = attr[1] - if HTML5lib.is_valid_encoding(tentative_encoding) + if HTML5.is_valid_encoding(tentative_encoding) @encoding = tentative_encoding return false end elsif attr[0] == 'content' content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1])) tentative_encoding = content_parser.parse - if HTML5lib.is_valid_encoding(tentative_encoding) + if HTML5.is_valid_encoding(tentative_encoding) @encoding = tentative_encoding return false end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb b/vendor/plugins/HTML5lib/lib/html5/liberalxmlparser.rb similarity index 87% rename from vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb rename to vendor/plugins/HTML5lib/lib/html5/liberalxmlparser.rb index bbcf0eac..eae80ff7 100755 --- a/vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb +++ b/vendor/plugins/HTML5lib/lib/html5/liberalxmlparser.rb @@ -11,10 +11,10 @@ # # @@TODO: # * Selectively lowercase only XHTML, but not foreign markup -require 'html5lib/html5parser' -require 'html5lib/constants' +require 'html5/html5parser' +require 'html5/constants' -module HTML5lib +module HTML5 # liberal XML parser class XMLParser < HTMLParser @@ -25,25 +25,35 @@ module HTML5lib end def normalizeToken(token) - if token[:type] == :StartTag or token[:type] == :EmptyTag + case token[:type] + when :StartTag, :EmptyTag # We need to remove the duplicate attributes and convert attributes - # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"} + # to a Hash so that [["x", "y"], ["x", "z"]] becomes {"x": "y"} token[:data] = Hash[*token[:data].reverse.flatten] # For EmptyTags, process both a Start and an End tag if token[:type] == :EmptyTag + save = @tokenizer.contentModelFlag @phase.processStartTag(token[:name], token[:data]) + @tokenizer.contentModelFlag = save token[:data] = {} token[:type] = :EndTag end - elsif token[:type] == :EndTag + when :Characters + # un-escape RCDATA_ELEMENTS (e.g. style, script) + if @tokenizer.contentModelFlag == :CDATA + token[:data] = token[:data]. + gsub('<','<').gsub('>','>').gsub('&','&') + end + + when :EndTag if token[:data] parseError(_("End tag contains unexpected attributes.")) end - elsif token[:type] == :Comment + when :Comment # Rescue CDATA from the comments if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]" token[:type] = :Characters diff --git a/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb b/vendor/plugins/HTML5lib/lib/html5/sanitizer.rb similarity index 99% rename from vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb rename to vendor/plugins/HTML5lib/lib/html5/sanitizer.rb index 5af9cf51..44f20c60 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb +++ b/vendor/plugins/HTML5lib/lib/html5/sanitizer.rb @@ -1,6 +1,7 @@ require 'cgi' +require 'html5/tokenizer' -module HTML5lib +module HTML5 # This module provides sanitization of XHTML+MathML+SVG # and of inline style attributes. diff --git a/vendor/plugins/HTML5lib/lib/html5/serializer.rb b/vendor/plugins/HTML5lib/lib/html5/serializer.rb new file mode 100644 index 00000000..f7187b7b --- /dev/null +++ b/vendor/plugins/HTML5lib/lib/html5/serializer.rb @@ -0,0 +1,2 @@ +require 'html5/serializer/htmlserializer' +require 'html5/serializer/xhtmlserializer' diff --git a/vendor/plugins/HTML5lib/lib/html5lib/serializer/htmlserializer.rb b/vendor/plugins/HTML5lib/lib/html5/serializer/htmlserializer.rb similarity index 93% rename from vendor/plugins/HTML5lib/lib/html5lib/serializer/htmlserializer.rb rename to vendor/plugins/HTML5lib/lib/html5/serializer/htmlserializer.rb index a03b7d79..3f4eb812 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/serializer/htmlserializer.rb +++ b/vendor/plugins/HTML5lib/lib/html5/serializer/htmlserializer.rb @@ -1,6 +1,6 @@ -require 'html5lib/constants' +require 'html5/constants' -module HTML5lib +module HTML5 class HTMLSerializer @@ -21,6 +21,7 @@ module HTML5lib @use_trailing_solidus = false @space_before_trailing_solidus = true @escape_lt_in_attrs = false + @escape_rcdata = false @omit_optional_tags = true @sanitize = false @@ -43,22 +44,22 @@ module HTML5lib @errors = [] if encoding and @inject_meta_charset - require 'html5lib/filters/inject_meta_charset' + require 'html5/filters/inject_meta_charset' treewalker = Filters::InjectMetaCharset.new(treewalker, encoding) end if @strip_whitespace - require 'html5lib/filters/whitespace' + require 'html5/filters/whitespace' treewalker = Filters::WhitespaceFilter.new(treewalker) end if @sanitize - require 'html5lib/filters/sanitizer' + require 'html5/filters/sanitizer' treewalker = Filters::HTMLSanitizeFilter.new(treewalker) end if @omit_optional_tags - require 'html5lib/filters/optionaltags' + require 'html5/filters/optionaltags' treewalker = Filters::OptionalTagFilter.new(treewalker) end @@ -81,7 +82,7 @@ module HTML5lib elsif [:StartTag, :EmptyTag].include? type name = token[:name] - if RCDATA_ELEMENTS.include?(name) + if RCDATA_ELEMENTS.include?(name) and not @escape_rcdata in_cdata = true elsif in_cdata serializeError(_("Unexpected child element of a CDATA element")) diff --git a/vendor/plugins/HTML5lib/lib/html5lib/serializer/xhtmlserializer.rb b/vendor/plugins/HTML5lib/lib/html5/serializer/xhtmlserializer.rb similarity index 72% rename from vendor/plugins/HTML5lib/lib/html5lib/serializer/xhtmlserializer.rb rename to vendor/plugins/HTML5lib/lib/html5/serializer/xhtmlserializer.rb index 43a63788..1e2885a6 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/serializer/xhtmlserializer.rb +++ b/vendor/plugins/HTML5lib/lib/html5/serializer/xhtmlserializer.rb @@ -1,6 +1,6 @@ -require 'html5lib/serializer/htmlserializer' +require 'html5/serializer/htmlserializer' -module HTML5lib +module HTML5 class XHTMLSerializer < HTMLSerializer DEFAULTS = { @@ -8,7 +8,8 @@ module HTML5lib :minimize_boolean_attributes => false, :use_trailing_solidus => true, :escape_lt_in_attrs => true, - :omit_optional_tags => false + :omit_optional_tags => false, + :escape_rcdata => true } def initialize(options={}) diff --git a/vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb b/vendor/plugins/HTML5lib/lib/html5/tokenizer.rb similarity index 93% rename from vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb rename to vendor/plugins/HTML5lib/lib/html5/tokenizer.rb index 6519944d..0d31d9de 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb +++ b/vendor/plugins/HTML5lib/lib/html5/tokenizer.rb @@ -1,7 +1,7 @@ -require 'html5lib/constants' -require 'html5lib/inputstream' +require 'html5/constants' +require 'html5/inputstream' -module HTML5lib +module HTML5 # This class takes care of tokenizing HTML. # @@ -84,9 +84,9 @@ module HTML5lib # Start processing. When EOF is reached @state will return false # instead of true and the loop will terminate. while send @state - while not @tokenQueue.empty? - yield @tokenQueue.shift - end + yield :type => :ParseError, :data => @stream.errors.shift until + @stream.errors.empty? + yield @tokenQueue.shift until @tokenQueue.empty? end end @@ -109,7 +109,7 @@ module HTML5lib # The character we just consumed need to be put back on the stack so it # doesn't get lost... - @stream.queue.push(data) + @stream.unget(data) end # This function returns either U+FFFD or the character based on the @@ -128,7 +128,6 @@ module HTML5lib radix = 16 end - char = [0xFFFD].pack('U') charStack = [] # Consume all the characters that are in range while making sure we @@ -142,17 +141,25 @@ module HTML5lib # Convert the set of characters consumed to an int. charAsInt = charStack.join('').to_i(radix) - # If the integer is between 127 and 160 (so 128 and bigger and 159 and - # smaller) we need to do the "windows trick". - if (127...160).include? charAsInt + if charAsInt == 13 + @tokenQueue.push({:type => :ParseError, :data => + _("Incorrect CR newline entity. Replaced with LF.")}) + charAsInt = 10 + elsif (128..159).include? charAsInt + # If the integer is between 127 and 160 (so 128 and bigger and 159 + # and smaller) we need to do the "windows trick". @tokenQueue.push({:type => :ParseError, :data => _("Entity used with illegal number (windows-1252 reference).")}) charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128] end - if charAsInt > 0 and charAsInt <= 1114111 + if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343) char = [charAsInt].pack('U') + else + char = [0xFFFD].pack('U') + @tokenQueue.push({:type => :ParseError, :data => + _("Numeric entity represents an illegal codepoint.")}) end # Discard the ; if present. Otherwise, put it back on the queue and @@ -160,18 +167,18 @@ module HTML5lib if c != ";" @tokenQueue.push({:type => :ParseError, :data => _("Numeric entity didn't end with ';'.")}) - @stream.queue.push(c) + @stream.unget(c) end return char end - def consumeEntity + def consumeEntity(from_attribute=false) char = nil charStack = [@stream.char] if SPACE_CHARACTERS.include?(charStack[0]) or [:EOF, '<', '&'].include?(charStack[0]) - @stream.queue+= charStack + @stream.unget(charStack) elsif charStack[0] == "#" # We might have a number entity here. charStack += [@stream.char, @stream.char] @@ -179,22 +186,22 @@ module HTML5lib # If we reach the end of the file put everything up to :EOF # back in the queue charStack = charStack[0...charStack.index(:EOF)] - @stream.queue+= charStack + @stream.unget(charStack) @tokenQueue.push({:type => :ParseError, :data => _("Numeric entity expected. Got end of file instead.")}) else if charStack[1].downcase == "x" \ and HEX_DIGITS.include? charStack[2] # Hexadecimal entity detected. - @stream.queue.push(charStack[2]) + @stream.unget(charStack[2]) char = consumeNumberEntity(true) elsif DIGITS.include? charStack[1] # Decimal entity detected. - @stream.queue += charStack[1..-1] + @stream.unget(charStack[1..-1]) char = consumeNumberEntity(false) else # No number entity detected. - @stream.queue += charStack + @stream.unget(charStack) @tokenQueue.push({:type => :ParseError, :data => _("Numeric entity expected but none found.")}) end @@ -209,6 +216,8 @@ module HTML5lib filteredEntityList.reject! {|e| e[0].chr != charStack[0]} entityName = nil + # Try to find the longest entity the string will match to take care + # of ¬i for instance. while charStack[-1] != :EOF name = charStack.join('') if filteredEntityList.any? {|e| e[0...name.length] == name} @@ -220,6 +229,7 @@ module HTML5lib if ENTITIES.include? name entityName = name + break if entityName[-1] == ';' end end @@ -228,15 +238,23 @@ module HTML5lib # Check whether or not the last character returned can be # discarded or needs to be put back. - if not charStack[-1] == ";" + if entityName[-1] != ?; @tokenQueue.push({:type => :ParseError, :data => _("Named entity didn't end with ';'.")}) - @stream.queue += charStack[entityName.length..-1] + end + + if charStack[-1] != ";" and from_attribute and + (ASCII_LETTERS.include?(charStack[entityName.length]) or + DIGITS.include?(charStack[entityName.length])) + @stream.unget(charStack) + char = '&' + else + @stream.unget(charStack[entityName.length..-1]) end else @tokenQueue.push({:type => :ParseError, :data => _("Named entity expected. Got none.")}) - @stream.queue += charStack + @stream.unget(charStack) end end return char @@ -244,7 +262,7 @@ module HTML5lib # This method replaces the need for "entityInAttributeValueState". def processEntityInAttribute - entity = consumeEntity + entity = consumeEntity(true) if entity @currentToken[:data][-1][1] += entity else @@ -274,20 +292,23 @@ module HTML5lib @lastFourChars.shift if @lastFourChars.length > 4 end - if data == "&" and [:PCDATA,:RCDATA].include?(@contentModelFlag) - @state = @states[:entityData] + if data == "&" and !@escapeFlag and + [:PCDATA,:RCDATA].include?(@contentModelFlag) + @state = @states[:entityData] - elsif data == "-" and [:CDATA,:RCDATA].include?(@contentModelFlag) and - @escapeFlag == false and @lastFourChars.join('') == "" + elsif data == ">" and @escapeFlag and + [:CDATA,:RCDATA].include?(@contentModelFlag) and + @lastFourChars[1..-1].join('') == "-->" @escapeFlag = false @tokenQueue.push({:type => :Characters, :data => data}) @@ -345,14 +366,14 @@ module HTML5lib @tokenQueue.push({:type => :ParseError, :data => _("Expected tag name. Got '?' instead (HTML doesn't " + "support processing instructions).")}) - @stream.queue.push(data) + @stream.unget(data) @state = @states[:bogusComment] else # XXX @tokenQueue.push({:type => :ParseError, :data => _("Expected tag name. Got something else instead")}) @tokenQueue.push({:type => :Characters, :data => "<"}) - @stream.queue.push(data) + @stream.unget(data) @state = @states[:data] end else @@ -363,7 +384,7 @@ module HTML5lib @state = @states[:closeTagOpen] else @tokenQueue.push({:type => :Characters, :data => "<"}) - @stream.queue.insert(0, data) + @stream.unget(data) @state = @states[:data] end end @@ -388,7 +409,7 @@ module HTML5lib # Since this is just for checking. We put the characters back on # the stack. - @stream.queue += charStack + @stream.unget(charStack) end if @currentToken and @@ -426,7 +447,7 @@ module HTML5lib # XXX data can be _'_... @tokenQueue.push({:type => :ParseError, :data => _("Expected closing tag. Unexpected character '#{data}' found.")}) - @stream.queue.push(data) + @stream.unget(data) @state = @states[:bogusComment] end @@ -556,7 +577,7 @@ module HTML5lib @state = @states[:attributeValueDoubleQuoted] elsif data == "&" @state = @states[:attributeValueUnQuoted] - @stream.queue.push(data); + @stream.unget(data); elsif data == "'" @state = @states[:attributeValueSingleQuoted] elsif data == ">" @@ -656,7 +677,7 @@ module HTML5lib else @tokenQueue.push({:type => :ParseError, :data => _("Expected '--' or 'DOCTYPE'. Not found.")}) - @stream.queue += charStack + @stream.unget(charStack) @state = @states[:bogusComment] end end @@ -771,7 +792,7 @@ module HTML5lib else @tokenQueue.push({:type => :ParseError, :data => _("No space after literal string 'DOCTYPE'.")}) - @stream.queue.push(data) + @stream.unget(data) @state = @states[:beforeDoctypeName] end return true @@ -827,7 +848,7 @@ module HTML5lib @state = @states[:data] elsif data == :EOF @currentToken[:data] = true - @stream.queue.push(data) + @stream.unget(data) @tokenQueue.push({:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}) @currentToken[:correct] = false @@ -842,7 +863,7 @@ module HTML5lib elsif token == "system" @state = @states[:beforeDoctypeSystemIdentifier] else - @stream.queue += charStack + @stream.unget(charStack) @tokenQueue.push({:type => :ParseError, :data => _("Expected 'public' or 'system'. Got '#{charStack.join('')}'")}) @state = @states[:bogusDoctype] @@ -1028,7 +1049,7 @@ module HTML5lib @state = @states[:data] elsif data == :EOF # XXX EMIT - @stream.queue.push(data) + @stream.unget(data) @tokenQueue.push({:type => :ParseError, :data => _("Unexpected end of file in bogus doctype.")}) @currentToken[:correct] = false diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb b/vendor/plugins/HTML5lib/lib/html5/treebuilders.rb similarity index 70% rename from vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb rename to vendor/plugins/HTML5lib/lib/html5/treebuilders.rb index 9fa49975..8c5bdd55 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb +++ b/vendor/plugins/HTML5lib/lib/html5/treebuilders.rb @@ -1,17 +1,17 @@ -module HTML5lib +module HTML5 module TreeBuilders class << self def [](name) case name.to_s.downcase when 'simpletree' then - require 'html5lib/treebuilders/simpletree' + require 'html5/treebuilders/simpletree' SimpleTree::TreeBuilder when 'rexml' then - require 'html5lib/treebuilders/rexml' + require 'html5/treebuilders/rexml' REXML::TreeBuilder when 'hpricot' then - require 'html5lib/treebuilders/hpricot' + require 'html5/treebuilders/hpricot' Hpricot::TreeBuilder else raise "Unknown TreeBuilder #{name}" diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb b/vendor/plugins/HTML5lib/lib/html5/treebuilders/base.rb similarity index 99% rename from vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb rename to vendor/plugins/HTML5lib/lib/html5/treebuilders/base.rb index 0d1082bd..f5d689db 100755 --- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb +++ b/vendor/plugins/HTML5lib/lib/html5/treebuilders/base.rb @@ -1,8 +1,8 @@ -require 'html5lib/constants' +require 'html5/constants' #XXX - TODO; make the default interface more ElementTree-like rather than DOM-like -module HTML5lib +module HTML5 # The scope markers are inserted when entering buttons, object elements, # marquees, table cells, and table captions, and are used to prevent formatting diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb b/vendor/plugins/HTML5lib/lib/html5/treebuilders/hpricot.rb similarity index 95% rename from vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb rename to vendor/plugins/HTML5lib/lib/html5/treebuilders/hpricot.rb index 20cc58b6..48c9a12d 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb +++ b/vendor/plugins/HTML5lib/lib/html5/treebuilders/hpricot.rb @@ -1,221 +1,221 @@ -require 'html5lib/treebuilders/base' -require 'rubygems' -require 'hpricot' -require 'forwardable' - -module HTML5lib - module TreeBuilders - module Hpricot - - class Node < Base::Node - - extend Forwardable - - def_delegators :@hpricot, :name - - attr_accessor :hpricot - - def initialize(name) - super(name) - @hpricot = self.class.hpricot_class.new name - end - - def appendChild(node) - if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode) - childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s - else - childNodes << node - hpricot.children << node.hpricot - end - if (oldparent = node.hpricot.parent) != nil - oldparent.children.delete_at(oldparent.children.index(node.hpricot)) - end - node.hpricot.parent = hpricot - node.parent = self - end - - def removeChild(node) - childNodes.delete(node) - hpricot.children.delete_at(hpricot.children.index(node.hpricot)) - node.hpricot.parent = nil - node.parent = nil - end - - def insertText(data, before=nil) - if before - insertBefore(TextNode.new(data), before) - else - appendChild(TextNode.new(data)) - end - end - - def insertBefore(node, refNode) - index = childNodes.index(refNode) - if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode) - childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s - else - refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot) - childNodes.insert(index, node) - end - end - - def hasContent - childNodes.any? - end - end - - class Element < Node - def self.hpricot_class - ::Hpricot::Elem - end - - def initialize(name) - super(name) - - @hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name)) - end - - def name - @hpricot.stag.name - end - - def cloneNode - attributes.inject(self.class.new(name)) do |node, (name, value)| - node.hpricot[name] = value - node - end - end - - # A call to Hpricot::Elem#raw_attributes is built dynamically, - # so alterations to the returned value (a hash) will be lost. - # - # AttributeProxy works around this by forwarding :[]= calls - # to the raw_attributes accessor on the element start tag. - # - class AttributeProxy - def initialize(hpricot) - @hpricot = hpricot - end - - def []=(k, v) - @hpricot.stag.send(stag_attributes_method)[k] = v - end - - def stag_attributes_method - # STag#attributes changed to STag#raw_attributes after Hpricot 0.5 - @hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes - end - - def method_missing(*a, &b) - @hpricot.attributes.send(*a, &b) - end - end - - def attributes - AttributeProxy.new(@hpricot) - end - - def attributes=(attrs) - attrs.each { |name, value| @hpricot[name] = value } - end - - def printTree(indent=0) - tree = "\n|#{' ' * indent}<#{name}>" - indent += 2 - attributes.each do |name, value| - next if name == 'xmlns' - tree += "\n|#{' ' * indent}#{name}=\"#{value}\"" - end - childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) } - end - end - - class Document < Node - def self.hpricot_class - ::Hpricot::Doc - end - - def initialize - super(nil) - end - - def printTree(indent=0) - childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) } - end - end - - class DocumentType < Node - def self.hpricot_class - ::Hpricot::DocType - end - - def initialize(name) - begin - super(name) - rescue ArgumentError # needs 3... - end - - @hpricot = ::Hpricot::DocType.new(name, nil, nil) - end - - def printTree(indent=0) - "\n|#{' ' * indent}" - end - end - - class DocumentFragment < Element - def initialize - super('') - end - - def printTree(indent=0) - childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) } - end - end - - class TextNode < Node - def initialize(data) - @hpricot = ::Hpricot::Text.new(data) - end - - def printTree(indent=0) - "\n|#{' ' * indent}\"#{hpricot.content}\"" - end - end - - class CommentNode < Node - def self.hpricot_class - ::Hpricot::Comment - end - - def printTree(indent=0) - "\n|#{' ' * indent}" - end - end - - class TreeBuilder < Base::TreeBuilder - def initialize - @documentClass = Document - @doctypeClass = DocumentType - @elementClass = Element - @commentClass = CommentNode - @fragmentClass = DocumentFragment - end - - def testSerializer(node) - node.printTree - end - - def getDocument - @document.hpricot - end - - def getFragment - @document = super - return @document.hpricot.children - end - end - - end - end -end +require 'html5/treebuilders/base' +require 'rubygems' +require 'hpricot' +require 'forwardable' + +module HTML5 + module TreeBuilders + module Hpricot + + class Node < Base::Node + + extend Forwardable + + def_delegators :@hpricot, :name + + attr_accessor :hpricot + + def initialize(name) + super(name) + @hpricot = self.class.hpricot_class.new name + end + + def appendChild(node) + if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode) + childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s + else + childNodes << node + hpricot.children << node.hpricot + end + if (oldparent = node.hpricot.parent) != nil + oldparent.children.delete_at(oldparent.children.index(node.hpricot)) + end + node.hpricot.parent = hpricot + node.parent = self + end + + def removeChild(node) + childNodes.delete(node) + hpricot.children.delete_at(hpricot.children.index(node.hpricot)) + node.hpricot.parent = nil + node.parent = nil + end + + def insertText(data, before=nil) + if before + insertBefore(TextNode.new(data), before) + else + appendChild(TextNode.new(data)) + end + end + + def insertBefore(node, refNode) + index = childNodes.index(refNode) + if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode) + childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s + else + refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot) + childNodes.insert(index, node) + end + end + + def hasContent + childNodes.any? + end + end + + class Element < Node + def self.hpricot_class + ::Hpricot::Elem + end + + def initialize(name) + super(name) + + @hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name)) + end + + def name + @hpricot.stag.name + end + + def cloneNode + attributes.inject(self.class.new(name)) do |node, (name, value)| + node.hpricot[name] = value + node + end + end + + # A call to Hpricot::Elem#raw_attributes is built dynamically, + # so alterations to the returned value (a hash) will be lost. + # + # AttributeProxy works around this by forwarding :[]= calls + # to the raw_attributes accessor on the element start tag. + # + class AttributeProxy + def initialize(hpricot) + @hpricot = hpricot + end + + def []=(k, v) + @hpricot.stag.send(stag_attributes_method)[k] = v + end + + def stag_attributes_method + # STag#attributes changed to STag#raw_attributes after Hpricot 0.5 + @hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes + end + + def method_missing(*a, &b) + @hpricot.attributes.send(*a, &b) + end + end + + def attributes + AttributeProxy.new(@hpricot) + end + + def attributes=(attrs) + attrs.each { |name, value| @hpricot[name] = value } + end + + def printTree(indent=0) + tree = "\n|#{' ' * indent}<#{name}>" + indent += 2 + attributes.each do |name, value| + next if name == 'xmlns' + tree += "\n|#{' ' * indent}#{name}=\"#{value}\"" + end + childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) } + end + end + + class Document < Node + def self.hpricot_class + ::Hpricot::Doc + end + + def initialize + super(nil) + end + + def printTree(indent=0) + childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) } + end + end + + class DocumentType < Node + def self.hpricot_class + ::Hpricot::DocType + end + + def initialize(name) + begin + super(name) + rescue ArgumentError # needs 3... + end + + @hpricot = ::Hpricot::DocType.new(name, nil, nil) + end + + def printTree(indent=0) + "\n|#{' ' * indent}" + end + end + + class DocumentFragment < Element + def initialize + super('') + end + + def printTree(indent=0) + childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) } + end + end + + class TextNode < Node + def initialize(data) + @hpricot = ::Hpricot::Text.new(data) + end + + def printTree(indent=0) + "\n|#{' ' * indent}\"#{hpricot.content}\"" + end + end + + class CommentNode < Node + def self.hpricot_class + ::Hpricot::Comment + end + + def printTree(indent=0) + "\n|#{' ' * indent}" + end + end + + class TreeBuilder < Base::TreeBuilder + def initialize + @documentClass = Document + @doctypeClass = DocumentType + @elementClass = Element + @commentClass = CommentNode + @fragmentClass = DocumentFragment + end + + def testSerializer(node) + node.printTree + end + + def getDocument + @document.hpricot + end + + def getFragment + @document = super + return @document.hpricot.children + end + end + + end + end +end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb b/vendor/plugins/HTML5lib/lib/html5/treebuilders/rexml.rb similarity index 98% rename from vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb rename to vendor/plugins/HTML5lib/lib/html5/treebuilders/rexml.rb index f6aad877..a8181430 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb +++ b/vendor/plugins/HTML5lib/lib/html5/treebuilders/rexml.rb @@ -1,8 +1,8 @@ -require 'html5lib/treebuilders/base' +require 'html5/treebuilders/base' require 'rexml/document' require 'forwardable' -module HTML5lib +module HTML5 module TreeBuilders module REXML diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/simpletree.rb b/vendor/plugins/HTML5lib/lib/html5/treebuilders/simpletree.rb similarity index 98% rename from vendor/plugins/HTML5lib/lib/html5lib/treebuilders/simpletree.rb rename to vendor/plugins/HTML5lib/lib/html5/treebuilders/simpletree.rb index 83034bff..827c0c0d 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/simpletree.rb +++ b/vendor/plugins/HTML5lib/lib/html5/treebuilders/simpletree.rb @@ -1,6 +1,6 @@ -require 'html5lib/treebuilders/base' +require 'html5/treebuilders/base' -module HTML5lib +module HTML5 module TreeBuilders module SimpleTree diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers.rb b/vendor/plugins/HTML5lib/lib/html5/treewalkers.rb similarity index 66% rename from vendor/plugins/HTML5lib/lib/html5lib/treewalkers.rb rename to vendor/plugins/HTML5lib/lib/html5/treewalkers.rb index 2074768c..82c73bb7 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers.rb +++ b/vendor/plugins/HTML5lib/lib/html5/treewalkers.rb @@ -1,19 +1,19 @@ -require 'html5lib/treewalkers/base' +require 'html5/treewalkers/base' -module HTML5lib +module HTML5 module TreeWalkers class << self def [](name) case name.to_s.downcase when 'simpletree' then - require 'html5lib/treewalkers/simpletree' + require 'html5/treewalkers/simpletree' SimpleTree::TreeWalker when 'rexml' then - require 'html5lib/treewalkers/rexml' + require 'html5/treewalkers/rexml' REXML::TreeWalker when 'hpricot' then - require 'html5lib/treewalkers/hpricot' + require 'html5/treewalkers/hpricot' Hpricot::TreeWalker else raise "Unknown TreeWalker #{name}" diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb b/vendor/plugins/HTML5lib/lib/html5/treewalkers/base.rb similarity index 98% rename from vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb rename to vendor/plugins/HTML5lib/lib/html5/treewalkers/base.rb index 21d4d3f7..394f8c07 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb +++ b/vendor/plugins/HTML5lib/lib/html5/treewalkers/base.rb @@ -1,5 +1,5 @@ -require 'html5lib/constants' -module HTML5lib +require 'html5/constants' +module HTML5 module TreeWalkers module TokenConstructor diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb b/vendor/plugins/HTML5lib/lib/html5/treewalkers/hpricot.rb similarity index 89% rename from vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb rename to vendor/plugins/HTML5lib/lib/html5/treewalkers/hpricot.rb index c9d12263..75cde344 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb +++ b/vendor/plugins/HTML5lib/lib/html5/treewalkers/hpricot.rb @@ -1,10 +1,10 @@ -require 'html5lib/treewalkers/base' +require 'html5/treewalkers/base' require 'rexml/document' -module HTML5lib +module HTML5 module TreeWalkers module Hpricot - class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker + class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker def node_details(node) case node diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/rexml.rb b/vendor/plugins/HTML5lib/lib/html5/treewalkers/rexml.rb similarity index 89% rename from vendor/plugins/HTML5lib/lib/html5lib/treewalkers/rexml.rb rename to vendor/plugins/HTML5lib/lib/html5/treewalkers/rexml.rb index c6881d97..695dc154 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/rexml.rb +++ b/vendor/plugins/HTML5lib/lib/html5/treewalkers/rexml.rb @@ -1,10 +1,10 @@ -require 'html5lib/treewalkers/base' +require 'html5/treewalkers/base' require 'rexml/document' -module HTML5lib +module HTML5 module TreeWalkers module REXML - class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker + class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker def node_details(node) case node diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/simpletree.rb b/vendor/plugins/HTML5lib/lib/html5/treewalkers/simpletree.rb similarity index 86% rename from vendor/plugins/HTML5lib/lib/html5lib/treewalkers/simpletree.rb rename to vendor/plugins/HTML5lib/lib/html5/treewalkers/simpletree.rb index 37ebf32a..3194389b 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/simpletree.rb +++ b/vendor/plugins/HTML5lib/lib/html5/treewalkers/simpletree.rb @@ -1,10 +1,10 @@ -require 'html5lib/treewalkers/base' +require 'html5/treewalkers/base' -module HTML5lib +module HTML5 module TreeWalkers module SimpleTree - class TreeWalker < HTML5lib::TreeWalkers::Base - include HTML5lib::TreeBuilders::SimpleTree + class TreeWalker < HTML5::TreeWalkers::Base + include HTML5::TreeBuilders::SimpleTree def walk(node) case node diff --git a/vendor/plugins/HTML5lib/lib/html5lib/constants.rb b/vendor/plugins/HTML5lib/lib/html5lib/constants.rb deleted file mode 100755 index 8144c93f..00000000 --- a/vendor/plugins/HTML5lib/lib/html5lib/constants.rb +++ /dev/null @@ -1,708 +0,0 @@ -module HTML5lib - - class EOF < Exception; end - - CONTENT_MODEL_FLAGS = [ - :PCDATA, - :RCDATA, - :CDATA, - :PLAINTEXT - ] - - SCOPING_ELEMENTS = %w[ - button - caption - html - marquee - object - table - td - th - ] - - FORMATTING_ELEMENTS = %w[ - a - b - big - em - font - i - nobr - s - small - strike - strong - tt - u - ] - - SPECIAL_ELEMENTS = %w[ - address - area - base - basefont - bgsound - blockquote - body - br - center - col - colgroup - dd - dir - div - dl - dt - embed - fieldset - form - frame - frameset - h1 - h2 - h3 - h4 - h5 - h6 - head - hr - iframe - image - img - input - isindex - li - link - listing - menu - meta - noembed - noframes - noscript - ol - optgroup - option - p - param - plaintext - pre - script - select - spacer - style - tbody - textarea - tfoot - thead - title - tr - ul - wbr - ] - - SPACE_CHARACTERS = %W[ - \t - \n - \x0B - \x0C - \x20 - \r - ] - - TABLE_INSERT_MODE_ELEMENTS = %w[ - table - tbody - tfoot - thead - tr - ] - - ASCII_LOWERCASE = ('a'..'z').to_a.join('') - ASCII_UPPERCASE = ('A'..'Z').to_a.join('') - ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE - DIGITS = '0'..'9' - HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a - - # Heading elements need to be ordered - HEADING_ELEMENTS = %w[ - h1 - h2 - h3 - h4 - h5 - h6 - ] - - # XXX What about event-source and command? - VOID_ELEMENTS = %w[ - base - link - meta - hr - br - img - embed - param - area - col - input - ] - - CDATA_ELEMENTS = %w[title textarea] - - RCDATA_ELEMENTS = %w[ - style - script - xmp - iframe - noembed - noframes - noscript - ] - - BOOLEAN_ATTRIBUTES = { - :global => %w[irrelevant], - 'style' => %w[scoped], - 'img' => %w[ismap], - 'audio' => %w[autoplay controls], - 'video' => %w[autoplay controls], - 'script' => %w[defer async], - 'details' => %w[open], - 'datagrid' => %w[multiple disabled], - 'command' => %w[hidden disabled checked default], - 'menu' => %w[autosubmit], - 'fieldset' => %w[disabled readonly], - 'option' => %w[disabled readonly selected], - 'optgroup' => %w[disabled readonly], - 'button' => %w[disabled autofocus], - 'input' => %w[disabled readonly required autofocus checked ismap], - 'select' => %w[disabled readonly autofocus multiple], - 'output' => %w[disabled readonly] - } - - # entitiesWindows1252 has to be _ordered_ and needs to have an index. - ENTITIES_WINDOWS1252 = [ - 8364, # 0x80 0x20AC EURO SIGN - 65533, # 0x81 UNDEFINED - 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK - 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK - 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK - 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS - 8224, # 0x86 0x2020 DAGGER - 8225, # 0x87 0x2021 DOUBLE DAGGER - 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT - 8240, # 0x89 0x2030 PER MILLE SIGN - 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON - 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK - 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE - 65533, # 0x8D UNDEFINED - 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON - 65533, # 0x8F UNDEFINED - 65533, # 0x90 UNDEFINED - 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK - 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK - 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK - 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK - 8226, # 0x95 0x2022 BULLET - 8211, # 0x96 0x2013 EN DASH - 8212, # 0x97 0x2014 EM DASH - 732, # 0x98 0x02DC SMALL TILDE - 8482, # 0x99 0x2122 TRADE MARK SIGN - 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON - 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK - 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE - 65533, # 0x9D UNDEFINED - 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON - 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS - ] - - private - - def self.U n - [n].pack('U') - end - - public - - ENTITIES = { - "AElig" => U(0xC6), - "Aacute" => U(0xC1), - "Acirc" => U(0xC2), - "Agrave" => U(0xC0), - "Alpha" => U(0x0391), - "Aring" => U(0xC5), - "Atilde" => U(0xC3), - "Auml" => U(0xC4), - "Beta" => U(0x0392), - "Ccedil" => U(0xC7), - "Chi" => U(0x03A7), - "Dagger" => U(0x2021), - "Delta" => U(0x0394), - "ETH" => U(0xD0), - "Eacute" => U(0xC9), - "Ecirc" => U(0xCA), - "Egrave" => U(0xC8), - "Epsilon" => U(0x0395), - "Eta" => U(0x0397), - "Euml" => U(0xCB), - "Gamma" => U(0x0393), - "Iacute" => U(0xCD), - "Icirc" => U(0xCE), - "Igrave" => U(0xCC), - "Iota" => U(0x0399), - "Iuml" => U(0xCF), - "Kappa" => U(0x039A), - "Lambda" => U(0x039B), - "Mu" => U(0x039C), - "Ntilde" => U(0xD1), - "Nu" => U(0x039D), - "OElig" => U(0x0152), - "Oacute" => U(0xD3), - "Ocirc" => U(0xD4), - "Ograve" => U(0xD2), - "Omega" => U(0x03A9), - "Omicron" => U(0x039F), - "Oslash" => U(0xD8), - "Otilde" => U(0xD5), - "Ouml" => U(0xD6), - "Phi" => U(0x03A6), - "Pi" => U(0x03A0), - "Prime" => U(0x2033), - "Psi" => U(0x03A8), - "Rho" => U(0x03A1), - "Scaron" => U(0x0160), - "Sigma" => U(0x03A3), - "THORN" => U(0xDE), - "Tau" => U(0x03A4), - "Theta" => U(0x0398), - "Uacute" => U(0xDA), - "Ucirc" => U(0xDB), - "Ugrave" => U(0xD9), - "Upsilon" => U(0x03A5), - "Uuml" => U(0xDC), - "Xi" => U(0x039E), - "Yacute" => U(0xDD), - "Yuml" => U(0x0178), - "Zeta" => U(0x0396), - "aacute" => U(0xE1), - "acirc" => U(0xE2), - "acute" => U(0xB4), - "aelig" => U(0xE6), - "agrave" => U(0xE0), - "alefsym" => U(0x2135), - "alpha" => U(0x03B1), - "amp" => U(0x26), - "AMP" => U(0x26), - "and" => U(0x2227), - "ang" => U(0x2220), - "apos" => U(0x27), - "aring" => U(0xE5), - "asymp" => U(0x2248), - "atilde" => U(0xE3), - "auml" => U(0xE4), - "bdquo" => U(0x201E), - "beta" => U(0x03B2), - "brvbar" => U(0xA6), - "bull" => U(0x2022), - "cap" => U(0x2229), - "ccedil" => U(0xE7), - "cedil" => U(0xB8), - "cent" => U(0xA2), - "chi" => U(0x03C7), - "circ" => U(0x02C6), - "clubs" => U(0x2663), - "cong" => U(0x2245), - "copy" => U(0xA9), - "COPY" => U(0xA9), - "crarr" => U(0x21B5), - "cup" => U(0x222A), - "curren" => U(0xA4), - "dArr" => U(0x21D3), - "dagger" => U(0x2020), - "darr" => U(0x2193), - "deg" => U(0xB0), - "delta" => U(0x03B4), - "diams" => U(0x2666), - "divide" => U(0xF7), - "eacute" => U(0xE9), - "ecirc" => U(0xEA), - "egrave" => U(0xE8), - "empty" => U(0x2205), - "emsp" => U(0x2003), - "ensp" => U(0x2002), - "epsilon" => U(0x03B5), - "equiv" => U(0x2261), - "eta" => U(0x03B7), - "eth" => U(0xF0), - "euml" => U(0xEB), - "euro" => U(0x20AC), - "exist" => U(0x2203), - "fnof" => U(0x0192), - "forall" => U(0x2200), - "frac12" => U(0xBD), - "frac14" => U(0xBC), - "frac34" => U(0xBE), - "frasl" => U(0x2044), - "gamma" => U(0x03B3), - "ge" => U(0x2265), - "gt" => U(0x3E), - "GT" => U(0x3E), - "hArr" => U(0x21D4), - "harr" => U(0x2194), - "hearts" => U(0x2665), - "hellip" => U(0x2026), - "iacute" => U(0xED), - "icirc" => U(0xEE), - "iexcl" => U(0xA1), - "igrave" => U(0xEC), - "image" => U(0x2111), - "infin" => U(0x221E), - "int" => U(0x222B), - "iota" => U(0x03B9), - "iquest" => U(0xBF), - "isin" => U(0x2208), - "iuml" => U(0xEF), - "kappa" => U(0x03BA), - "lArr" => U(0x21D0), - "lambda" => U(0x03BB), - "lang" => U(0x2329), - "laquo" => U(0xAB), - "larr" => U(0x2190), - "lceil" => U(0x2308), - "ldquo" => U(0x201C), - "le" => U(0x2264), - "lfloor" => U(0x230A), - "lowast" => U(0x2217), - "loz" => U(0x25CA), - "lrm" => U(0x200E), - "lsaquo" => U(0x2039), - "lsquo" => U(0x2018), - "lt" => U(0x3C), - "LT" => U(0x3C), - "macr" => U(0xAF), - "mdash" => U(0x2014), - "micro" => U(0xB5), - "middot" => U(0xB7), - "minus" => U(0x2212), - "mu" => U(0x03BC), - "nabla" => U(0x2207), - "nbsp" => U(0xA0), - "ndash" => U(0x2013), - "ne" => U(0x2260), - "ni" => U(0x220B), - "not" => U(0xAC), - "notin" => U(0x2209), - "nsub" => U(0x2284), - "ntilde" => U(0xF1), - "nu" => U(0x03BD), - "oacute" => U(0xF3), - "ocirc" => U(0xF4), - "oelig" => U(0x0153), - "ograve" => U(0xF2), - "oline" => U(0x203E), - "omega" => U(0x03C9), - "omicron" => U(0x03BF), - "oplus" => U(0x2295), - "or" => U(0x2228), - "ordf" => U(0xAA), - "ordm" => U(0xBA), - "oslash" => U(0xF8), - "otilde" => U(0xF5), - "otimes" => U(0x2297), - "ouml" => U(0xF6), - "para" => U(0xB6), - "part" => U(0x2202), - "permil" => U(0x2030), - "perp" => U(0x22A5), - "phi" => U(0x03C6), - "pi" => U(0x03C0), - "piv" => U(0x03D6), - "plusmn" => U(0xB1), - "pound" => U(0xA3), - "prime" => U(0x2032), - "prod" => U(0x220F), - "prop" => U(0x221D), - "psi" => U(0x03C8), - "quot" => U(0x22), - "QUOT" => U(0x22), - "rArr" => U(0x21D2), - "radic" => U(0x221A), - "rang" => U(0x232A), - "raquo" => U(0xBB), - "rarr" => U(0x2192), - "rceil" => U(0x2309), - "rdquo" => U(0x201D), - "real" => U(0x211C), - "reg" => U(0xAE), - "REG" => U(0xAE), - "rfloor" => U(0x230B), - "rho" => U(0x03C1), - "rlm" => U(0x200F), - "rsaquo" => U(0x203A), - "rsquo" => U(0x2019), - "sbquo" => U(0x201A), - "scaron" => U(0x0161), - "sdot" => U(0x22C5), - "sect" => U(0xA7), - "shy" => U(0xAD), - "sigma" => U(0x03C3), - "sigmaf" => U(0x03C2), - "sim" => U(0x223C), - "spades" => U(0x2660), - "sub" => U(0x2282), - "sube" => U(0x2286), - "sum" => U(0x2211), - "sup" => U(0x2283), - "sup1" => U(0xB9), - "sup2" => U(0xB2), - "sup3" => U(0xB3), - "supe" => U(0x2287), - "szlig" => U(0xDF), - "tau" => U(0x03C4), - "there4" => U(0x2234), - "theta" => U(0x03B8), - "thetasym" => U(0x03D1), - "thinsp" => U(0x2009), - "thorn" => U(0xFE), - "tilde" => U(0x02DC), - "times" => U(0xD7), - "trade" => U(0x2122), - "uArr" => U(0x21D1), - "uacute" => U(0xFA), - "uarr" => U(0x2191), - "ucirc" => U(0xFB), - "ugrave" => U(0xF9), - "uml" => U(0xA8), - "upsih" => U(0x03D2), - "upsilon" => U(0x03C5), - "uuml" => U(0xFC), - "weierp" => U(0x2118), - "xi" => U(0x03BE), - "yacute" => U(0xFD), - "yen" => U(0xA5), - "yuml" => U(0xFF), - "zeta" => U(0x03B6), - "zwj" => U(0x200D), - "zwnj" => U(0x200C) - } - - ENCODINGS = %w[ - ansi_x3.4-1968 - iso-ir-6 - ansi_x3.4-1986 - iso_646.irv:1991 - ascii - iso646-us - us-ascii - us - ibm367 - cp367 - csascii - ks_c_5601-1987 - korean - iso-2022-kr - csiso2022kr - euc-kr - iso-2022-jp - csiso2022jp - iso-2022-jp-2 - iso-ir-58 - chinese - csiso58gb231280 - iso_8859-1:1987 - iso-ir-100 - iso_8859-1 - iso-8859-1 - latin1 - l1 - ibm819 - cp819 - csisolatin1 - iso_8859-2:1987 - iso-ir-101 - iso_8859-2 - iso-8859-2 - latin2 - l2 - csisolatin2 - iso_8859-3:1988 - iso-ir-109 - iso_8859-3 - iso-8859-3 - latin3 - l3 - csisolatin3 - iso_8859-4:1988 - iso-ir-110 - iso_8859-4 - iso-8859-4 - latin4 - l4 - csisolatin4 - iso_8859-6:1987 - iso-ir-127 - iso_8859-6 - iso-8859-6 - ecma-114 - asmo-708 - arabic - csisolatinarabic - iso_8859-7:1987 - iso-ir-126 - iso_8859-7 - iso-8859-7 - elot_928 - ecma-118 - greek - greek8 - csisolatingreek - iso_8859-8:1988 - iso-ir-138 - iso_8859-8 - iso-8859-8 - hebrew - csisolatinhebrew - iso_8859-5:1988 - iso-ir-144 - iso_8859-5 - iso-8859-5 - cyrillic - csisolatincyrillic - iso_8859-9:1989 - iso-ir-148 - iso_8859-9 - iso-8859-9 - latin5 - l5 - csisolatin5 - iso-8859-10 - iso-ir-157 - l6 - iso_8859-10:1992 - csisolatin6 - latin6 - hp-roman8 - roman8 - r8 - ibm037 - cp037 - csibm037 - ibm424 - cp424 - csibm424 - ibm437 - cp437 - 437 - cspc8codepage437 - ibm500 - cp500 - csibm500 - ibm775 - cp775 - cspc775baltic - ibm850 - cp850 - 850 - cspc850multilingual - ibm852 - cp852 - 852 - cspcp852 - ibm855 - cp855 - 855 - csibm855 - ibm857 - cp857 - 857 - csibm857 - ibm860 - cp860 - 860 - csibm860 - ibm861 - cp861 - 861 - cp-is - csibm861 - ibm862 - cp862 - 862 - cspc862latinhebrew - ibm863 - cp863 - 863 - csibm863 - ibm864 - cp864 - csibm864 - ibm865 - cp865 - 865 - csibm865 - ibm866 - cp866 - 866 - csibm866 - ibm869 - cp869 - 869 - cp-gr - csibm869 - ibm1026 - cp1026 - csibm1026 - koi8-r - cskoi8r - koi8-u - big5-hkscs - ptcp154 - csptcp154 - pt154 - cp154 - utf-7 - utf-16be - utf-16le - utf-16 - utf-8 - iso-8859-13 - iso-8859-14 - iso-ir-199 - iso_8859-14:1998 - iso_8859-14 - latin8 - iso-celtic - l8 - iso-8859-15 - iso_8859-15 - iso-8859-16 - iso-ir-226 - iso_8859-16:2001 - iso_8859-16 - latin10 - l10 - gbk - cp936 - ms936 - gb18030 - shift_jis - ms_kanji - csshiftjis - euc-jp - gb2312 - big5 - csbig5 - windows-1250 - windows-1251 - windows-1252 - windows-1253 - windows-1254 - windows-1255 - windows-1256 - windows-1257 - windows-1258 - tis-620 - hz-gb-2312 - ] - -end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/filters.rb b/vendor/plugins/HTML5lib/lib/html5lib/filters.rb deleted file mode 100644 index 05c3edd4..00000000 --- a/vendor/plugins/HTML5lib/lib/html5lib/filters.rb +++ /dev/null @@ -1 +0,0 @@ -require 'html5lib/filters/optionaltags' diff --git a/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb b/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb deleted file mode 100644 index cd4c66a6..00000000 --- a/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb +++ /dev/null @@ -1,2 +0,0 @@ -require 'html5lib/serializer/htmlserializer' -require 'html5lib/serializer/xhtmlserializer' diff --git a/vendor/plugins/HTML5lib/parse.rb b/vendor/plugins/HTML5lib/parse.rb index 79233712..ba0d9071 100755 --- a/vendor/plugins/HTML5lib/parse.rb +++ b/vendor/plugins/HTML5lib/parse.rb @@ -26,15 +26,15 @@ def parse(opts, args) exit(1) end - require 'html5lib/treebuilders' - treebuilder = HTML5lib::TreeBuilders[opts.treebuilder] + require 'html5/treebuilders' + treebuilder = HTML5::TreeBuilders[opts.treebuilder] if opts.output == :xml - require 'html5lib/liberalxmlparser' - p = HTML5lib::XHTMLParser.new(:tree=>treebuilder) + require 'html5/liberalxmlparser' + p = HTML5::XHTMLParser.new(:tree=>treebuilder) else - require 'html5lib/html5parser' - p = HTML5lib::HTMLParser.new(:tree=>treebuilder) + require 'html5/html5parser' + p = HTML5::HTMLParser.new(:tree=>treebuilder) end if opts.parsemethod == :parse @@ -70,10 +70,10 @@ def printOutput(parser, document, opts) when :xml print document when :html - require 'html5lib/treewalkers' - tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document) - require 'html5lib/serializer' - puts HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer) + require 'html5/treewalkers' + tokens = HTML5::TreeWalkers[opts.treebuilder].new(document) + require 'html5/serializer' + puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer) when :hilite print document.hilite when :tree @@ -188,6 +188,10 @@ opts = OptionParser.new do |opts| options.serializer[:escape_lt_in_attrs] = lt end + opts.on("--[no-]escape-rcdata", "Escape rcdata element values") do |rcdata| + options.serializer[:escape_rcdata] = rcdata + end + opts.separator "" opts.separator "Other Options:" diff --git a/vendor/plugins/HTML5lib/testdata/encoding/tests2.dat b/vendor/plugins/HTML5lib/testdata/encoding/tests2.dat index dd43f85c..dc74859c 100644 --- a/vendor/plugins/HTML5lib/testdata/encoding/tests2.dat +++ b/vendor/plugins/HTML5lib/testdata/encoding/tests2.dat @@ -33,7 +33,6 @@ EUC-jp #encoding EUC-jp - #data diff --git a/vendor/plugins/HTML5lib/testdata/serializer/core.test b/vendor/plugins/HTML5lib/testdata/serializer/core.test index fc981c14..d427822a 100644 --- a/vendor/plugins/HTML5lib/testdata/serializer/core.test +++ b/vendor/plugins/HTML5lib/testdata/serializer/core.test @@ -92,7 +92,8 @@ {"description": "rcdata", "input": [["StartTag", "script", {}], ["Characters", "ac&d"]], - "expected": ["

@@ -1511,6 +1515,7 @@ unexpected EOF | |

|


+|

#data @@ -1807,6 +1812,7 @@ Unexpected EOF | | |
+|

#data

@@ -1928,3 +1934,4 @@ Unexpected EOF | | | +|

diff --git a/vendor/plugins/HTML5lib/testdata/tree-construction/tests2.dat b/vendor/plugins/HTML5lib/testdata/tree-construction/tests2.dat index fdf8356a..0b83d94c 100755 --- a/vendor/plugins/HTML5lib/testdata/tree-construction/tests2.dat +++ b/vendor/plugins/HTML5lib/testdata/tree-construction/tests2.dat @@ -777,3 +777,4 @@ Unexpected

end tag. | | |
+|

diff --git a/vendor/plugins/HTML5lib/testdata/tree-construction/tests3.dat b/vendor/plugins/HTML5lib/testdata/tree-construction/tests3.dat index a66effff..b447d300 100644 --- a/vendor/plugins/HTML5lib/testdata/tree-construction/tests3.dat +++ b/vendor/plugins/HTML5lib/testdata/tree-construction/tests3.dat @@ -61,7 +61,6 @@ No DOCTYPE #data

-
 foo
#errors #document @@ -72,10 +71,22 @@ foo |
 |       "foo"
 
-
 #data
 
 
+foo
+#errors +#document +| +| +| +| +|
+|       "
+foo"
+
+#data
+
 foo
 
#errors @@ -183,7 +194,6 @@ y
#data #errors #document @@ -194,6 +204,20 @@ foo | +#errors +#document +| +| +| +| +|