From bd8ba1f4b123e2ba137b0c2b680c00c71578cbdf Mon Sep 17 00:00:00 2001 From: Jacques Distler Date: Tue, 5 Jun 2007 16:34:49 -0500 Subject: [PATCH] REXML Trees Synced with latest HTML5lib. Added preliminary support (currently disabled) for sanitizing REXML trees. --- lib/chunks/engines.rb | 7 +- lib/sanitize.rb | 21 +- lib/string_utils.rb | 17 + .../HTML5lib/lib/html5lib/constants.rb | 20 + .../HTML5lib/lib/html5lib/html5parser.rb | 4 +- .../html5lib/html5parser/in_table_phase.rb | 2 +- .../lib/html5lib/html5parser/phase.rb | 2 +- .../HTML5lib/lib/html5lib/inputstream.rb | 29 +- .../HTML5lib/lib/html5lib/sanitizer.rb | 43 +- .../HTML5lib/lib/html5lib/serializer.rb | 418 ++++++++++++++++++ .../HTML5lib/lib/html5lib/treebuilders.rb | 11 +- .../lib/html5lib/treebuilders/base.rb | 2 +- .../lib/html5lib/treebuilders/hpricot.rb | 4 + .../lib/html5lib/treebuilders/rexml.rb | 13 +- .../HTML5lib/lib/html5lib/treewalkers.rb | 26 ++ .../HTML5lib/lib/html5lib/treewalkers/base.rb | 156 +++++++ .../lib/html5lib/treewalkers/hpricot.rb | 48 ++ .../lib/html5lib/treewalkers/rexml.rb | 48 ++ .../lib/html5lib/treewalkers/simpletree.rb | 48 ++ vendor/plugins/HTML5lib/parse.rb | 137 ++++++ vendor/plugins/HTML5lib/tests/preamble.rb | 50 +++ .../plugins/HTML5lib/tests/test_encoding.rb | 36 +- vendor/plugins/HTML5lib/tests/test_parser.rb | 52 +-- .../plugins/HTML5lib/tests/test_sanitizer.rb | 4 +- .../plugins/HTML5lib/tests/test_serializer.rb | 52 +++ vendor/plugins/HTML5lib/tests/test_stream.rb | 54 +++ .../HTML5lib/tests/test_treewalkers.rb | 110 +++++ .../maruku/lib/maruku/output/to_html.rb | 15 + 28 files changed, 1317 insertions(+), 112 deletions(-) create mode 100644 vendor/plugins/HTML5lib/lib/html5lib/serializer.rb create mode 100644 vendor/plugins/HTML5lib/lib/html5lib/treewalkers.rb create mode 100644 vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb create mode 100644 vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb create mode 100644 
vendor/plugins/HTML5lib/lib/html5lib/treewalkers/rexml.rb create mode 100644 vendor/plugins/HTML5lib/lib/html5lib/treewalkers/simpletree.rb create mode 100755 vendor/plugins/HTML5lib/parse.rb create mode 100644 vendor/plugins/HTML5lib/tests/test_serializer.rb create mode 100755 vendor/plugins/HTML5lib/tests/test_stream.rb create mode 100644 vendor/plugins/HTML5lib/tests/test_treewalkers.rb diff --git a/lib/chunks/engines.rb b/lib/chunks/engines.rb index c870541a..4f11608b 100644 --- a/lib/chunks/engines.rb +++ b/lib/chunks/engines.rb @@ -53,9 +53,10 @@ module Engines def mask require_dependency 'maruku' require_dependency 'maruku/ext/math' - html = Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"), - {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html - sanitize_xhtml(html.to_ncr) +# html = sanitize_rexml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"), +# {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html_tree) + html = sanitize_xhtml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"), + {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html.to_ncr) end end diff --git a/lib/sanitize.rb b/lib/sanitize.rb index 69f8e3e7..32d4afc5 100644 --- a/lib/sanitize.rb +++ b/lib/sanitize.rb @@ -8,19 +8,36 @@ module Sanitize # # sanitize_xhtml() is a case-sensitive sanitizer, suitable for XHTML # sanitize_html() is a case-insensitive sanitizer suitable for HTML +# sanitize_rexml() sanitizes a REXML tree, returning a string - require 'html5lib/sanitizer' require 'html5lib/html5parser' require 'html5lib/liberalxmlparser' + + require 'html5lib/treewalkers' + require 'html5lib/serializer' + require 'string_utils' + require 'html5lib/sanitizer' + include HTML5lib def sanitize_xhtml(html) - XHTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s + XHTMLParser.parseFragment(html.to_ncr, :tokenizer => HTMLSanitizer).to_s end def sanitize_html(html) 
HTMLParser.parseFragment(html, :tokenizer => HTMLSanitizer).to_s end + def sanitize_rexml(tree) + tokens = TreeWalkers.getTreeWalker('rexml').new(tree.to_ncr) + HTMLSerializer.serialize(tokens, {:encoding=>'utf-8', + :quote_attr_values => 'true', + :minimize_boolean_attributes => 'false', + :use_trailing_solidus => 'true', + :space_before_trailing_solidus => 'true', + :omit_optional_tags => 'false', + :inject_meta_charset => 'false', + :sanitize => 'true'}) + end end diff --git a/lib/string_utils.rb b/lib/string_utils.rb index 04ec2b2c..e3059a6c 100644 --- a/lib/string_utils.rb +++ b/lib/string_utils.rb @@ -2155,3 +2155,20 @@ class String end end + +require 'rexml/element' +module REXML + class Element + def to_ncr + XPath.each(self, '//*') { |el| + el.texts.each_index {|i| + el.texts[i].value = el.texts[i].to_s.to_ncr + } + el.attributes.each { |name,val| + el.attributes[name] = val.to_ncr + } + } + return self + end + end +end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/constants.rb b/vendor/plugins/HTML5lib/lib/html5lib/constants.rb index c0c3dc3f..b28a6f01 100755 --- a/vendor/plugins/HTML5lib/lib/html5lib/constants.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/constants.rb @@ -148,6 +148,26 @@ module HTML5lib input ] + BOOLEAN_ATTRIBUTES = { + :global => %w[irrelevant], + 'style' => %w[scoped], + 'img' => %w[ismap], + 'audio' => %w[autoplay controls], + 'video' => %w[autoplay controls], + 'script' => %w[defer async], + 'details' => %w[open], + 'datagrid' => %w[multiple disabled], + 'command' => %w[hidden disabled checked default], + 'menu' => %w[autosubmit], + 'fieldset' => %w[disabled readonly], + 'option' => %w[disabled readonly selected], + 'optgroup' => %w[disabled readonly], + 'button' => %w[disabled autofocus], + 'input' => %w[disabled readonly required autofocus checked ismap], + 'select' => %w[disabled readonly autofocus multiple], + 'output' => %w[disabled readonly] + } + # entitiesWindows1252 has to be _ordered_ and needs to have an index. 
ENTITIES_WINDOWS1252 = [ 8364, # 0x80 0x20AC EURO SIGN diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb b/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb index 178ed574..7de4dfba 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb @@ -37,13 +37,13 @@ module HTML5lib # :strict - raise an exception when a parse error is encountered # :tree - a treebuilder class controlling the type of tree that will be # returned. Built in treebuilders can be accessed through - # html5lib.treebuilders.getTreeBuilder(treeType) + # HTML5lib::TreeBuilders[treeType] def initialize(options = {}) @strict = false @errors = [] @tokenizer = HTMLTokenizer - @tree = TreeBuilders::REXMLTree::TreeBuilder + @tree = TreeBuilders::REXML::TreeBuilder options.each { |name, value| instance_variable_set("@#{name}", value) } diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb index c4b86039..808ac03c 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb @@ -107,4 +107,4 @@ module HTML5lib end end -end \ No newline at end of file +end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb index 3a96b66f..6a271504 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb @@ -153,4 +153,4 @@ module HTML5lib end end -end \ No newline at end of file +end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb b/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb index 62cc9948..2f11e2d8 100755 --- a/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb @@ -58,7 +58,7 @@ module HTML5lib unless @char_encoding == 
'utf-8' begin require 'iconv' - uString = Iconv.iconv('utf-8', @encoding, uString)[0] + uString = Iconv.iconv('utf-8', @char_encoding, uString)[0] rescue end end @@ -95,11 +95,13 @@ module HTML5lib #First look for a BOM #This will also read past the BOM if present encoding = detect_bom + #If there is no BOM need to look for meta elements with encoding #information if encoding.nil? and @parse_meta encoding = detect_encoding_meta end + #Guess with chardet, if avaliable if encoding.nil? and @chardet begin @@ -111,13 +113,14 @@ module HTML5lib rescue LoadError end end + # If all else fails use the default encoding if encoding.nil? encoding = @DEFAULT_ENCODING end #Substitute for equivalent encodings: - encoding_sub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'} + encoding_sub = {'iso-8859-1' => 'windows-1252'} if encoding_sub.has_key?(encoding.downcase) encoding = encoding_sub[encoding.downcase] @@ -132,10 +135,10 @@ module HTML5lib def detect_bom bom_dict = { "\xef\xbb\xbf" => 'utf-8', - "\xff\xfe" => 'utf-16-le', - "\xfe\xff" => 'utf-16-be', - "\xff\xfe\x00\x00" => 'utf-32-le', - "\x00\x00\xfe\xff" => 'utf-32-be' + "\xff\xfe" => 'utf16le', + "\xfe\xff" => 'utf16be', + "\xff\xfe\x00\x00" => 'utf32le', + "\x00\x00\xfe\xff" => 'utf32be' } # Go to beginning of file and read in 4 bytes @@ -205,7 +208,17 @@ module HTML5lib else begin @tell += 1 - return @data_stream[@tell - 1].chr + c = @data_stream[@tell - 1] + case c + when 0xC2 .. 0xDF + @tell += 1 + c.chr + @data_stream[@tell-1].chr + when 0xE0 .. 
0xF0 + @tell += 2 + c.chr + @data_stream[@tell-2].chr + @data_stream[@tell-1].chr + else + c.chr + end rescue return :EOF end @@ -227,8 +240,8 @@ module HTML5lib else # Then the rest begin - char_stack.push(@data_stream[@tell].chr) @tell += 1 + char_stack.push(@data_stream[@tell-1].chr) rescue char_stack.push(:EOF) break diff --git a/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb b/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb index 6f7cdcac..9168ba4d 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb @@ -1,4 +1,3 @@ -require 'html5lib/tokenizer' require 'cgi' module HTML5lib @@ -6,7 +5,7 @@ module HTML5lib # This module provides sanitization of XHTML+MathML+SVG # and of inline style attributes. - class HTMLSanitizer < HTMLTokenizer + module HTMLSanitizeModule ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt @@ -96,19 +95,7 @@ module HTML5lib ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS - # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and - # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style - # attributes are parsed, and a restricted set, # specified by - # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through. - # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified - # in ALLOWED_PROTOCOLS are allowed. 
- # - # sanitize_html('') - # => <script> do_nasty_stuff() </script> - # sanitize_html('Click here for $100') - # => Click here for $100 - def each - super do |token| + def process_token(token) case token[:type] when :StartTag, :EndTag, :EmptyTag if ALLOWED_ELEMENTS.include?(token[:name]) @@ -126,7 +113,7 @@ module HTML5lib end token[:data] = attrs.map {|k,v| [k,v]} end - yield token + return token else if token[:type] == :EndTag token[:data] = "" @@ -139,12 +126,11 @@ module HTML5lib token[:data].insert(-2,'/') if token[:type] == :EmptyTag token[:type] = :Characters token.delete(:name) - yield token + return token end else - yield token + return token end - end end def sanitize_css(style) @@ -174,4 +160,23 @@ module HTML5lib style = clean.join(' ') end end + + class HTMLSanitizeFilter < Filter + include HTMLSanitizeModule + def each + @source.each do |token| + yield(process_token(token)) + end + end + end + + class HTMLSanitizer < HTMLTokenizer + include HTMLSanitizeModule + def each + super do |token| + yield(process_token(token)) + end + end + end + end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb b/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb new file mode 100644 index 00000000..1e2ec4cb --- /dev/null +++ b/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb @@ -0,0 +1,418 @@ +require 'html5lib/constants' +require 'jcode' + +module HTML5lib + +class Filter + include Enumerable + def initialize(source) + @source = source + end +end + +class OptionalTagFilter < Filter + def slider + previous1 = previous2 = nil + @source.each do |token| + yield previous2, previous1, token if previous1 != nil + previous2 = previous1 + previous1 = token + end + yield previous2, previous1, nil + end + + def each + slider do |previous, token, nexttok| + type = token[:type] + if type == :StartTag + yield token unless token[:data].empty? 
and is_optional_start(token[:name], previous, nexttok) + elsif type == :EndTag + yield token unless is_optional_end(token[:name], nexttok) + else + yield token + end + end + end + + def is_optional_start(tagname, previous, nexttok) + type = nexttok ? nexttok[:type] : nil + if tagname == 'html' + # An html element's start tag may be omitted if the first thing + # inside the html element is not a space character or a comment. + return ![:Comment, :SpaceCharacters].include?(type) + elsif tagname == 'head' + # A head element's start tag may be omitted if the first thing + # inside the head element is an element. + return type == :StartTag + elsif tagname == 'body' + # A body element's start tag may be omitted if the first thing + # inside the body element is not a space character or a comment, + # except if the first thing inside the body element is a script + # or style element and the node immediately preceding the body + # element is a head element whose end tag has been omitted. + if [:Comment, :SpaceCharacters].include?(type) + return false + elsif type == :StartTag + # XXX: we do not look at the preceding event, so we never omit + # the body element's start tag if it's followed by a script or + # a style element. + return !%w[script style].include?(nexttok[:name]) + else + return true + end + elsif tagname == 'colgroup' + # A colgroup element's start tag may be omitted if the first thing + # inside the colgroup element is a col element, and if the element + # is not immediately preceded by another colgroup element whose + # end tag has been omitted. + if type == :StartTag + # XXX: we do not look at the preceding event, so instead we never + # omit the colgroup element's end tag when it is immediately + # followed by another colgroup element. See is_optional_end. 
+ return nexttok[:name] == "col" + else + return false + end + elsif tagname == 'tbody' + # A tbody element's start tag may be omitted if the first thing + # inside the tbody element is a tr element, and if the element is + # not immediately preceded by a tbody, thead, or tfoot element + # whose end tag has been omitted. + if type == :StartTag + # omit the thead and tfoot elements' end tag when they are + # immediately followed by a tbody element. See is_optional_end. + if previous and previous[:type] == :EndTag and \ + %w(tbody thead tfoot).include?(previous[:name]) + return false + end + + return nexttok[:name] == 'tr' + else + return false + end + end + return false + end + + def is_optional_end(tagname, nexttok) + type = nexttok ? nexttok[:type] : nil + if %w[html head body].include?(tagname) + # An html element's end tag may be omitted if the html element + # is not immediately followed by a space character or a comment. + return ![:Comment, :SpaceCharacters].include?(type) + elsif %w[li optgroup option tr].include?(tagname) + # A li element's end tag may be omitted if the li element is + # immediately followed by another li element or if there is + # no more content in the parent element. + # An optgroup element's end tag may be omitted if the optgroup + # element is immediately followed by another optgroup element, + # or if there is no more content in the parent element. + # An option element's end tag may be omitted if the option + # element is immediately followed by another option element, + # or if there is no more content in the parent element. + # A tr element's end tag may be omitted if the tr element is + # immediately followed by another tr element, or if there is + # no more content in the parent element. 
+ if type == :StartTag + return nexttok[:name] == tagname + else + return type == :EndTag || type == nil + end + elsif %w(dt dd).include?(tagname) + # A dt element's end tag may be omitted if the dt element is + # immediately followed by another dt element or a dd element. + # A dd element's end tag may be omitted if the dd element is + # immediately followed by another dd element or a dt element, + # or if there is no more content in the parent element. + if type == :StartTag + return %w(dt dd).include?(nexttok[:name]) + elsif tagname == 'dd' + return type == :EndTag || type == nil + else + return false + end + elsif tagname == 'p' + # A p element's end tag may be omitted if the p element is + # immediately followed by an address, blockquote, dl, fieldset, + # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table, + # or ul element, or if there is no more content in the parent + # element. + if type == :StartTag + return %w(address blockquote dl fieldset form h1 h2 h3 h4 h5 + h6 hr menu ol p pre table ul).include?(nexttok[:name]) + else + return type == :EndTag || type == nil + end + elsif tagname == 'colgroup' + # A colgroup element's end tag may be omitted if the colgroup + # element is not immediately followed by a space character or + # a comment. + if [:Comment, :SpaceCharacters].include?(type) + return false + elsif type == :StartTag + # XXX: we also look for an immediately following colgroup + # element. See is_optional_start. + return nexttok[:name] != 'colgroup' + else + return true + end + elsif %w(thead tbody).include? tagname + # A thead element's end tag may be omitted if the thead element + # is immediately followed by a tbody or tfoot element. + # A tbody element's end tag may be omitted if the tbody element + # is immediately followed by a tbody or tfoot element, or if + # there is no more content in the parent element. 
+ # A tfoot element's end tag may be omitted if the tfoot element + # is immediately followed by a tbody element, or if there is no + # more content in the parent element. + # XXX: we never omit the end tag when the following element is + # a tbody. See is_optional_start. + if type == :StartTag + return %w(tbody tfoot).include?(nexttok[:name]) + elsif tagname == 'tbody' + return (type == :EndTag or type == nil) + else + return false + end + elsif tagname == 'tfoot' + # A tfoot element's end tag may be omitted if the tfoot element + # is immediately followed by a tbody element, or if there is no + # more content in the parent element. + # XXX: we never omit the end tag when the following element is + # a tbody. See is_optional_start. + if type == :StartTag + return nexttok[:name] == 'tbody' + else + return type == :EndTag || type == nil + end + elsif %w(td th).include? tagname + # A td element's end tag may be omitted if the td element is + # immediately followed by a td or th element, or if there is + # no more content in the parent element. + # A th element's end tag may be omitted if the th element is + # immediately followed by a td or th element, or if there is + # no more content in the parent element. 
+ if type == :StartTag + return %w(td th).include?(nexttok[:name]) + else + return type == :EndTag || type == nil + end + end + return false + end +end + +class HTMLSerializer + CDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript] + + def self.serialize(stream, options = {}) + new(options).serialize(stream) + end + + def initialize(options={}) + @quote_attr_values = false + @quote_char = '"' + @use_best_quote_char = true + @minimize_boolean_attributes = true + + @use_trailing_solidus = false + @space_before_trailing_solidus = true + + @omit_optional_tags = true + @sanitize = false + + @strip_whitespace = false + + @inject_meta_charset = true + + options.each do |name, value| + next unless %w(quote_attr_values quote_char use_best_quote_char + minimize_boolean_attributes use_trailing_solidus + space_before_trailing_solidus omit_optional_tags sanitize + strip_whitespace inject_meta_charset).include? name.to_s + @use_best_quote_char = false if name.to_s == 'quote_char' + instance_variable_set("@#{name}", value) + end + + @errors = [] + end + + def serialize(treewalker, encoding=nil) + in_cdata = false + @errors = [] + if encoding and @inject_meta_charset + treewalker = filter_inject_meta_charset(treewalker, encoding) + end + if @strip_whitespace + treewalker = filter_whitespace(treewalker) + end + if @sanitize + require 'html5lib/sanitizer' + treewalker = HTMLSanitizeFilter.new(treewalker) + end +# if @omit_optional_tags +# treewalker = OptionalTagFilter.new(treewalker) +# end + + result = [] + treewalker.each do |token| + type = token[:type] + if type == :Doctype + doctype = "" % token[:name] + if encoding + result << doctype.encode(encoding) + else + result << doctype + end + + elsif [:Characters, :SpaceCharacters].include? type + if type == :SpaceCharacters or in_cdata + if in_cdata and token[:data].find("= 0 + serializeError(_("Unexpected ", ">") + end + + elsif [:StartTag, :EmptyTag].include? 
type + name = token[:name] + if CDATA_ELEMENTS.include?(name) + in_cdata = true + elsif in_cdata + serializeError(_("Unexpected child element of a CDATA element")) + end + attrs = token[:data].to_a + attrs.sort() + attributes = [] + for k,v in attrs + if encoding + k = k.encode(encoding) + end + attributes << ' ' + + attributes << k + if not @minimize_boolean_attributes or \ + (!(BOOLEAN_ATTRIBUTES[name]||[]).include?(k) \ + and !BOOLEAN_ATTRIBUTES[:global].include?(k)) + attributes << "=" + if @quote_attr_values or v.empty? + quote_attr = true + else + quote_attr = (SPACE_CHARACTERS.join('') + "<>\"'").each_char.any? {|c| v.include?(c)} + end + v = v.gsub("&", "&") + if encoding + v = v.encode(encoding, unicode_encode_errors) + end + if quote_attr + quote_char = @quote_char + if @use_best_quote_char + if v.index("'") and !v.index('"') + quote_char = '"' + elsif v.index('"') and !v.index("'") + quote_char = "'" + end + end + if quote_char == "'" + v = v.gsub("'", "'") + else + v = v.gsub('"', """) + end + attributes << quote_char << v << quote_char + else + attributes << v + end + end + end + if VOID_ELEMENTS.include?(name) and @use_trailing_solidus + if @space_before_trailing_solidus + attributes << " /" + else + attributes << "/" + end + end + if encoding + result << "<%s%s>" % [name.encode(encoding), attributes.join('')] + else + result << "<%s%s>" % [name, attributes.join('')] + end + + elsif type == :EndTag + name = token[:name] + if CDATA_ELEMENTS.include?(name) + in_cdata = false + elsif in_cdata + serializeError(_("Unexpected child element of a CDATA element")) + end + end_tag = "" % name + if encoding + end_tag = end_tag.encode(encoding) + end + result << end_tag + + elsif type == :Comment + data = token[:data] + if data.index("--") + serializeError(_("Comment contains --")) + end + comment = "" % token[:data] + if encoding + comment = comment.encode(encoding, unicode_encode_errors) + end + result << comment + + else + serializeError(token[:data]) + end + 
end + result.join('') + end + + def render(treewalker, encoding=nil) + if encoding + return "".join(list(serialize(treewalker, encoding))) + else + return "".join(list(serialize(treewalker))) + end + end + + def serializeError(data="XXX ERROR MESSAGE NEEDED") + # XXX The idea is to make data mandatory. + @errors.push(data) + if @strict + raise SerializeError + end + end + + def filter_inject_meta_charset(treewalker, encoding) + done = false + for token in treewalker + if not done and token[:type] == :StartTag \ + and token[:name].lower() == "head" + yield({:type => :EmptyTag, :name => "meta", \ + :data => {"charset" => encoding}}) + end + yield token + end + end + + def filter_whitespace(treewalker) + raise NotImplementedError + end +end + +# Error in serialized tree +class SerializeError < Exception +end +end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb index 176b402a..9fa49975 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb @@ -1,21 +1,24 @@ module HTML5lib module TreeBuilders - def self.getTreeBuilder(name) - case name.to_s.downcase + class << self + def [](name) + case name.to_s.downcase when 'simpletree' then require 'html5lib/treebuilders/simpletree' SimpleTree::TreeBuilder when 'rexml' then require 'html5lib/treebuilders/rexml' - REXMLTree::TreeBuilder + REXML::TreeBuilder when 'hpricot' then require 'html5lib/treebuilders/hpricot' Hpricot::TreeBuilder else raise "Unknown TreeBuilder #{name}" + end end - end + alias :getTreeBuilder :[] + end end end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb index 5c1be892..0d1082bd 100755 --- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb @@ -144,7 +144,7 @@ module HTML5lib # code. 
It should still do the same though. # Step 1: stop the algorithm when there's nothing to do. - return unless @activeFormattingElements + return if @activeFormattingElements.empty? # Step 2 and step 3: we start with the last element. So i is -1. i = -1 diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb index 3ea8afa2..fc120827 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb @@ -1,4 +1,5 @@ require 'html5lib/treebuilders/base' +require 'rubygems' require 'hpricot' require 'forwardable' @@ -26,12 +27,14 @@ module HTML5lib childNodes << node hpricot.children << node.hpricot end + node.hpricot.parent = hpricot node.parent = self end def removeChild(node) childNodes.delete(node) hpricot.children.delete_at(hpricot.children.index(node.hpricot)) + node.hpricot.parent = nil node.parent = nil end @@ -48,6 +51,7 @@ module HTML5lib if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode) childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s else + refNode.hpricot.parent.insert_before(node.hpricot,refNode.hpricot) childNodes.insert(index, node) end end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb index 7c389ca6..f6aad877 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb @@ -4,7 +4,7 @@ require 'forwardable' module HTML5lib module TreeBuilders - module REXMLTree + module REXML class Node < Base::Node extend Forwardable @@ -52,6 +52,7 @@ module HTML5lib childNodes[index-1].rxobj.raw = true else childNodes.insert index, node + refNode.rxobj.parent.insert_before(refNode.rxobj,node.rxobj) end end @@ -62,7 +63,7 @@ module HTML5lib class Element < Node def self.rxclass - 
REXML::Element + ::REXML::Element end def initialize name @@ -95,7 +96,7 @@ module HTML5lib class Document < Node def self.rxclass - REXML::Document + ::REXML::Document end def initialize @@ -120,7 +121,7 @@ module HTML5lib class DocumentType < Node def self.rxclass - REXML::DocType + ::REXML::DocType end def printTree indent=0 @@ -145,7 +146,7 @@ module HTML5lib class TextNode < Node def initialize data raw=data.gsub('&','&').gsub('<','<').gsub('>','>') - @rxobj = REXML::Text.new(raw, true, nil, true) + @rxobj = ::REXML::Text.new(raw, true, nil, true) end def printTree indent=0 @@ -155,7 +156,7 @@ module HTML5lib class CommentNode < Node def self.rxclass - REXML::Comment + ::REXML::Comment end def printTree indent=0 diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers.rb b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers.rb new file mode 100644 index 00000000..2074768c --- /dev/null +++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers.rb @@ -0,0 +1,26 @@ +require 'html5lib/treewalkers/base' + +module HTML5lib + module TreeWalkers + + class << self + def [](name) + case name.to_s.downcase + when 'simpletree' then + require 'html5lib/treewalkers/simpletree' + SimpleTree::TreeWalker + when 'rexml' then + require 'html5lib/treewalkers/rexml' + REXML::TreeWalker + when 'hpricot' then + require 'html5lib/treewalkers/hpricot' + Hpricot::TreeWalker + else + raise "Unknown TreeWalker #{name}" + end + end + + alias :getTreeWalker :[] + end + end +end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb new file mode 100644 index 00000000..64c280df --- /dev/null +++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb @@ -0,0 +1,156 @@ +require 'html5lib/constants' +module HTML5lib +module TreeWalkers + +module TokenConstructor + def error(msg) + return {:type => "SerializeError", :data => msg} + end + + def normalizeAttrs(attrs) + attrs.to_a + end + + def emptyTag(name, attrs, 
hasChildren=false) + error(_("Void element has children")) if hasChildren + return({:type => :EmptyTag, :name => name, \ + :data => normalizeAttrs(attrs)}) + end + + def startTag(name, attrs) + return {:type => :StartTag, :name => name, \ + :data => normalizeAttrs(attrs)} + end + + def endTag(name) + return {:type => :EndTag, :name => name, :data => []} + end + + def text(data) + if data =~ /^([#{SPACE_CHARACTERS.join('')}]+)/ + yield({:type => :SpaceCharacters, :data => $1}) + data = data[$1.length .. -1] + return if data.empty? + end + + if data =~ /([#{SPACE_CHARACTERS.join('')}]+)$/ + yield({:type => :Characters, :data => data[0 ... -$1.length]}) + yield({:type => :SpaceCharacters, :data => $1}) + else + yield({:type => :Characters, :data => data}) + end + end + + def comment(data) + return {:type => :Comment, :data => data} + end + + def doctype(name) + return {:type => :Doctype, :name => name, :data => name.upcase() == "HTML"} + end + + def unknown(nodeType) + return error(_("Unknown node type: ") + nodeType.to_s) + end + + def _(str) + str + end +end + +class Base + include TokenConstructor + + def initialize(tree) + @tree = tree + end + + def each + raise NotImplementedError + end + + alias walk each +end + +class NonRecursiveTreeWalker < TreeWalkers::Base + def node_details(node) + raise NotImplementedError + end + + def first_child(node) + raise NotImplementedError + end + + def next_sibling(node) + raise NotImplementedError + end + + def parent(node) + raise NotImplementedError + end + + def each + currentNode = @tree + while currentNode != nil + details = node_details(currentNode) + hasChildren = false + + case details.shift + when :DOCTYPE + yield doctype(*details) + + when :TEXT + text(*details) {|token| yield token} + + when :ELEMENT + name, attributes, hasChildren = details + if VOID_ELEMENTS.include?(name) + yield emptyTag(name, attributes.to_a, hasChildren) + hasChildren = false + else + yield startTag(name, attributes.to_a) + end + + when 
:COMMENT + yield comment(details[0]) + + when :DOCUMENT, :DOCUMENT_FRAGMENT + hasChildren = true + + when nil + # ignore (REXML::XMLDecl is an example) + + else + yield unknown(details[0]) + end + + firstChild = hasChildren ? first_child(currentNode) : nil + if firstChild != nil + currentNode = firstChild + else + while currentNode != nil + details = node_details(currentNode) + if details.shift == :ELEMENT + name, attributes, hasChildren = details + yield endTag(name) if !VOID_ELEMENTS.include?(name) + end + + if @tree == currentNode + currentNode = nil + else + nextSibling = next_sibling(currentNode) + if nextSibling != nil + currentNode = nextSibling + break + end + + currentNode = parent(currentNode) + end + end + end + end + end +end + +end +end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb new file mode 100644 index 00000000..bf129891 --- /dev/null +++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb @@ -0,0 +1,48 @@ +require 'html5lib/treewalkers/base' +require 'rexml/document' + +module HTML5lib + module TreeWalkers + module Hpricot + class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker + + def node_details(node) + case node + when ::Hpricot::Elem + if !node.name + [:DOCUMENT_FRAGMENT] + else + [:ELEMENT, node.name, + node.attributes.map {|name,value| [name,value]}, + !node.empty?] 
+ end + when ::Hpricot::Text + [:TEXT, node.to_plain_text] + when ::Hpricot::Comment + [:COMMENT, node.content] + when ::Hpricot::Doc + [:DOCUMENT] + when ::Hpricot::DocType + [:DOCTYPE, node.target] + when ::Hpricot::XMLDecl + [nil] + else + [:UNKNOWN, node.class.inspect] + end + end + + def first_child(node) + node.children.first + end + + def next_sibling(node) + node.next_node + end + + def parent(node) + node.parent + end + end + end + end +end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/rexml.rb b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/rexml.rb new file mode 100644 index 00000000..c6881d97 --- /dev/null +++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/rexml.rb @@ -0,0 +1,48 @@ +require 'html5lib/treewalkers/base' +require 'rexml/document' + +module HTML5lib + module TreeWalkers + module REXML + class TreeWalker < HTML5lib::TreeWalkers::NonRecursiveTreeWalker + + def node_details(node) + case node + when ::REXML::Document + [:DOCUMENT] + when ::REXML::Element + if !node.name + [:DOCUMENT_FRAGMENT] + else + [:ELEMENT, node.name, + node.attributes.map {|name,value| [name,value]}, + node.has_elements? || node.has_text?] 
+ end + when ::REXML::Text + [:TEXT, node.value] + when ::REXML::Comment + [:COMMENT, node.string] + when ::REXML::DocType + [:DOCTYPE, node.name] + when ::REXML::XMLDecl + [nil] + else + [:UNKNOWN, node.class.inspect] + end + end + + def first_child(node) + node.children.first + end + + def next_sibling(node) + node.next_sibling + end + + def parent(node) + node.parent + end + end + end + end +end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/simpletree.rb b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/simpletree.rb new file mode 100644 index 00000000..37ebf32a --- /dev/null +++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/simpletree.rb @@ -0,0 +1,48 @@ +require 'html5lib/treewalkers/base' + +module HTML5lib + module TreeWalkers + module SimpleTree + class TreeWalker < HTML5lib::TreeWalkers::Base + include HTML5lib::TreeBuilders::SimpleTree + + def walk(node) + case node + when Document, DocumentFragment + return + + when DocumentType + yield doctype(node.name) + + when TextNode + text(node.value) {|token| yield token} + + when Element + if VOID_ELEMENTS.include?(node.name) + yield emptyTag(node.name, node.attributes, node.hasContent()) + else + yield startTag(node.name, node.attributes) + for child in node.childNodes + walk(child) {|token| yield token} + end + yield endTag(node.name) + end + + when CommentNode + yield comment(node.value) + + else + puts '?' 
+ yield unknown(node.class) + end + end + + def each + for child in @tree.childNodes + walk(child) {|node| yield node} + end + end + end + end + end +end diff --git a/vendor/plugins/HTML5lib/parse.rb b/vendor/plugins/HTML5lib/parse.rb new file mode 100755 index 00000000..845d3726 --- /dev/null +++ b/vendor/plugins/HTML5lib/parse.rb @@ -0,0 +1,137 @@ +#!/usr/bin/env ruby +# +# Parse a document to a simpletree tree, with optional profiling + +$:.unshift File.dirname(__FILE__),'lib' + +def parse(opts, args) + + f = args[-1] + if f + begin + require 'open-uri' if f[0..6] == 'http://' + f = open(f) + rescue + end + else + $stderr.write("No filename provided. Use -h for help\n") + exit(1) + end + + require 'html5lib/treebuilders' + treebuilder = HTML5lib::TreeBuilders[opts.treebuilder] + + if opts.output == :xml + require 'html5lib/liberalxmlparser' + p = HTML5lib::XHTMLParser.new(:tree=>treebuilder) + else + require 'html5lib/html5parser' + p = HTML5lib::HTMLParser.new(:tree=>treebuilder) + end + + if opts.profile + require 'profiler' + Profiler__::start_profile + p.parse(f) + Profiler__::stop_profile + Profiler__::print_profile($stderr) + elsif opts.time + require 'time' + t0 = Time.new + document = p.parse(f) + t1 = Time.new + printOutput(p, document, opts) + t2 = Time.new + puts "\n\nRun took: %fs (plus %fs to print the output)"%[t1-t0, t2-t1] + else + document = p.parse(f) + printOutput(p, document, opts) + end +end + +def printOutput(parser, document, opts) + puts "Encoding: #{parser.tokenizer.stream.char_encoding}" if opts.encoding + + case opts.output + when :xml + print document + when :html + require 'html5lib/treewalkers' + tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document) + require 'html5lib/serializer' + print HTML5lib::HTMLSerializer.serialize(tokens, :encoding=>'utf-8') + when :hilite + print document.hilite + when :tree + print parser.tree.testSerializer(document) + end + + if opts.error + errList=[] + for pos, message in parser.errors + 
errList << ("Line %i Col %i"%pos + " " + message) + end + $stderr.write("\nParse errors:\n" + errList.join("\n")+"\n") + end +end + +require 'ostruct' +options = OpenStruct.new +options.profile = false +options.time = false +options.output = :tree +options.treebuilder = 'simpletree' +options.error = false +options.encoding = false + +require 'optparse' +opts = OptionParser.new do |opts| + opts.on("-p", "--[no-]profile", "Profile the run") do |profile| + options.profile = profile + end + + opts.on("-t", "--[no-]time", "Time the run") do |time| + options.time = time + end + + opts.on("--[no-]tree", "Do not print output tree") do |tree| + if tree + options.output = :tree + else + options.output = nil + end + end + + opts.on("-b", "--treebuilder NAME") do |treebuilder| + options.treebuilder = treebuilder + end + + opts.on("-e", "--error", "Print a list of parse errors") do |error| + options.error = error + end + + opts.on("-x", "--xml", "output as xml") do |xml| + options.output = :xml + options.treebuilder = "rexml" + end + + opts.on("--html", "Output as html") do |html| + options.output = :html + end + + opts.on("--hilite", "Output as formatted highlighted code.") do |hilite| + options.output = :hilite + end + + opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding| + options.encoding = encoding + end + + opts.on_tail("-h", "--help", "Show this message") do + puts opts + exit + end +end + +opts.parse!(ARGV) +parse options, ARGV diff --git a/vendor/plugins/HTML5lib/tests/preamble.rb b/vendor/plugins/HTML5lib/tests/preamble.rb index 6e2d5a27..164be8b1 100644 --- a/vendor/plugins/HTML5lib/tests/preamble.rb +++ b/vendor/plugins/HTML5lib/tests/preamble.rb @@ -21,3 +21,53 @@ rescue LoadError end end end + +module HTML5lib + module TestSupport + def self.startswith?(a, b) + b[0... 
a.length] == a + end + + def self.parseTestcase(data) + innerHTML = nil + input = [] + output = [] + errors = [] + currentList = input + data.split(/\n/).each do |line| + if !line.empty? and !startswith?("#errors", line) and + !startswith?("#document", line) and + !startswith?("#data", line) and + !startswith?("#document-fragment", line) + + if currentList == output and startswith?("|", line) + currentList.push(line[2..-1]) + else + currentList.push(line) + end + elsif line == "#errors" + currentList = errors + elsif line == "#document" or startswith?("#document-fragment", line) + if startswith?("#document-fragment", line) + innerHTML = line[19..-1] + raise AssertionError unless innerHTML + end + currentList = output + end + end + return innerHTML, input.join("\n"), output.join("\n"), errors + end + + # convert the output of str(document) to the format used in the testcases + def convertTreeDump(treedump) + treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n") + end + + def sortattrs(output) + output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match| + match.split("\n").sort.join("\n") + end + end + + end +end diff --git a/vendor/plugins/HTML5lib/tests/test_encoding.rb b/vendor/plugins/HTML5lib/tests/test_encoding.rb index 384887c7..dd6e52af 100755 --- a/vendor/plugins/HTML5lib/tests/test_encoding.rb +++ b/vendor/plugins/HTML5lib/tests/test_encoding.rb @@ -4,33 +4,33 @@ require 'html5lib/inputstream' class Html5EncodingTestCase < Test::Unit::TestCase -begin + begin require 'rubygems' require 'UniversalDetector' def test_chardet - File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file| - stream = HTML5lib::HTMLInputStream.new(file, :chardet => true) - assert_equal 'big5', stream.char_encoding.downcase - end + File.open(File.join(HTML5LIB_BASE, 'tests', 'encoding', 'chardet', 'test_big5.txt')) do |file| + stream = HTML5lib::HTMLInputStream.new(file, :chardet => true) + 
assert_equal 'big5', stream.char_encoding.downcase + end end -rescue LoadError + rescue LoadError puts "chardet not found, skipping chardet tests" -end + end - html5lib_test_files('encoding').each do |test_file| - test_name = File.basename(test_file).sub('.dat', '').tr('-', '') + html5lib_test_files('encoding').each do |test_file| + test_name = File.basename(test_file).sub('.dat', '').tr('-', '') - File.read(test_file).split("#data\n").each_with_index do |data, index| - next if data.empty? - input, encoding = data.split(/\n#encoding\s+/, 2) - encoding = encoding.split[0] + File.read(test_file).split("#data\n").each_with_index do |data, index| + next if data.empty? + input, encoding = data.split(/\n#encoding\s+/, 2) + encoding = encoding.split[0] - define_method 'test_%s_%d' % [ test_name, index + 1 ] do - stream = HTML5lib::HTMLInputStream.new(input, :chardet => false) - assert_equal encoding.downcase, stream.char_encoding.downcase, input - end - end + define_method 'test_%s_%d' % [ test_name, index + 1 ] do + stream = HTML5lib::HTMLInputStream.new(input, :chardet => false) + assert_equal encoding.downcase, stream.char_encoding.downcase, input + end end + end end diff --git a/vendor/plugins/HTML5lib/tests/test_parser.rb b/vendor/plugins/HTML5lib/tests/test_parser.rb index ab26cb07..61813825 100644 --- a/vendor/plugins/HTML5lib/tests/test_parser.rb +++ b/vendor/plugins/HTML5lib/tests/test_parser.rb @@ -14,53 +14,12 @@ end $CHECK_PARSER_ERRORS = false -puts 'Testing: ' + $tree_types_to_test * ', ' +puts 'Testing tree builders: ' + $tree_types_to_test * ', ' class Html5ParserTestCase < Test::Unit::TestCase - - def self.startswith?(a, b) - b[0... a.length] == a - end - - def self.parseTestcase(data) - innerHTML = nil - input = [] - output = [] - errors = [] - currentList = input - data.split(/\n/).each do |line| - if !line.empty? 
and !startswith?("#errors", line) and - !startswith?("#document", line) and - !startswith?("#data", line) and - !startswith?("#document-fragment", line) - - if currentList == output and startswith?("|", line) - currentList.push(line[2..-1]) - else - currentList.push(line) - end - elsif line == "#errors" - currentList = errors - elsif line == "#document" or startswith?("#document-fragment", line) - if startswith?("#document-fragment", line) - innerHTML = line[19..-1] - raise AssertionError unless innerHTML - end - currentList = output - end - end - return innerHTML, input.join("\n"), output.join("\n"), errors - end - - # convert the output of str(document) to the format used in the testcases - def convertTreeDump(treedump) - treedump.split(/\n/)[1..-1].map { |line| (line.length > 2 and line[0] == ?|) ? line[3..-1] : line }.join("\n") - end - - def sortattrs(output) - output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) { |match| match.split("\n").sort.join("\n") } - end + include HTML5lib + include TestSupport html5lib_test_files('tree-construction').each do |test_file| @@ -69,12 +28,13 @@ class Html5ParserTestCase < Test::Unit::TestCase File.read(test_file).split("#data\n").each_with_index do |data, index| next if data.empty? 
- innerHTML, input, expected_output, expected_errors = parseTestcase(data) + innerHTML, input, expected_output, expected_errors = + TestSupport.parseTestcase(data) $tree_types_to_test.each do |tree_name| define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do - parser = HTML5lib::HTMLParser.new(:tree => HTML5lib::TreeBuilders.getTreeBuilder(tree_name)) + parser = HTMLParser.new(:tree => TreeBuilders[tree_name]) if innerHTML parser.parseFragment(input, innerHTML) diff --git a/vendor/plugins/HTML5lib/tests/test_sanitizer.rb b/vendor/plugins/HTML5lib/tests/test_sanitizer.rb index 3b440071..b8d6fc57 100644 --- a/vendor/plugins/HTML5lib/tests/test_sanitizer.rb +++ b/vendor/plugins/HTML5lib/tests/test_sanitizer.rb @@ -2,9 +2,11 @@ require File.join(File.dirname(__FILE__), 'preamble') -require 'html5lib/sanitizer' require 'html5lib/html5parser' require 'html5lib/liberalxmlparser' +require 'html5lib/treewalkers' +require 'html5lib/serializer' +require 'html5lib/sanitizer' class SanitizeTest < Test::Unit::TestCase include HTML5lib diff --git a/vendor/plugins/HTML5lib/tests/test_serializer.rb b/vendor/plugins/HTML5lib/tests/test_serializer.rb new file mode 100644 index 00000000..4224e34a --- /dev/null +++ b/vendor/plugins/HTML5lib/tests/test_serializer.rb @@ -0,0 +1,52 @@ +require File.join(File.dirname(__FILE__), 'preamble') + +require 'html5lib/html5parser' +require 'html5lib/serializer' +require 'html5lib/treewalkers' + +#Run the serialize error checks +checkSerializeErrors = false + +class JsonWalker < HTML5lib::TreeWalkers::Base + def each + @tree.each do |token| + case token[0] + when 'StartTag' + yield startTag(token[1], token[2]) + when 'EndTag' + yield endTag(token[1]) + when 'EmptyTag' + yield emptyTag(token[1], token[2]) + when 'Comment' + yield comment(token[1]) + when 'Characters', 'SpaceCharacters' + text(token[1]) {|textToken| yield textToken} + when 'Doctype' + yield doctype(token[1]) + else + raise ValueError("Unknown token type: " + type) + 
end + end + end +end + +class Html5SerializeTestcase < Test::Unit::TestCase + html5lib_test_files('serializer').each do |filename| + test_name = File.basename(filename).sub('.test', '') + tests = JSON::parse(open(filename).read) + tests['tests'].each_with_index do |test, index| + + define_method "test_#{test_name}_#{index+1}" do + result = HTML5lib::HTMLSerializer. + serialize(JsonWalker.new(test["input"]), (test["options"] || {})) + expected = test["expected"] + if expected.length == 1 + assert_equal(expected[0], result, test["description"]) + elsif !expected.include?(result) + flunk("Expected: #{expected.inspect}, Received: #{result.inspect}") + end + end + + end + end +end diff --git a/vendor/plugins/HTML5lib/tests/test_stream.rb b/vendor/plugins/HTML5lib/tests/test_stream.rb new file mode 100755 index 00000000..e2d6fe78 --- /dev/null +++ b/vendor/plugins/HTML5lib/tests/test_stream.rb @@ -0,0 +1,54 @@ +require File.join(File.dirname(__FILE__), 'preamble') + +require 'html5lib/inputstream' + +class HTMLInputStreamTest < Test::Unit::TestCase + include HTML5lib + + def test_char_ascii + stream = HTMLInputStream.new("'") + assert_equal('ascii', stream.char_encoding) + assert_equal("'", stream.char) + end + + def test_char_null + stream = HTMLInputStream.new("\x00") + assert_equal("\xef\xbf\xbd", stream.char) + end + + def test_char_utf8 + stream = HTMLInputStream.new("\xe2\x80\x98") + assert_equal('utf-8', stream.char_encoding) + assert_equal("\xe2\x80\x98", stream.char) + end + + def test_char_win1252 + stream = HTMLInputStream.new("\x91") + assert_equal('windows-1252', stream.char_encoding) + assert_equal("\xe2\x80\x98", stream.char) + end + + def test_bom + stream = HTMLInputStream.new("\xef\xbb\xbf" + "'") + assert_equal('utf-8', stream.char_encoding) + assert_equal("'", stream.char) + end + + def test_utf_16 + stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025) + assert(stream.char_encoding, 'utf-16-le') + assert_equal(1025, stream.chars_until(' 
',true).length) + end + + def test_newlines + stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd") + assert_equal(0, stream.instance_eval {@tell}) + assert_equal("a\nbb\n", stream.chars_until('c')) + assert_equal(6, stream.instance_eval {@tell}) + assert_equal([3,1], stream.position) + assert_equal("ccc\ndddd", stream.chars_until('x')) + assert_equal(14, stream.instance_eval {@tell}) + assert_equal([4,5], stream.position) + assert_equal([0,1,4,8], stream.instance_eval {@new_lines}) + end +end diff --git a/vendor/plugins/HTML5lib/tests/test_treewalkers.rb b/vendor/plugins/HTML5lib/tests/test_treewalkers.rb new file mode 100644 index 00000000..9fcaa502 --- /dev/null +++ b/vendor/plugins/HTML5lib/tests/test_treewalkers.rb @@ -0,0 +1,110 @@ +require File.join(File.dirname(__FILE__), 'preamble') + +require 'html5lib/html5parser' +require 'html5lib/treewalkers' +require 'html5lib/treebuilders' + +$tree_types_to_test = { + 'simpletree' => + {:builder => HTML5lib::TreeBuilders['simpletree'], + :walker => HTML5lib::TreeWalkers['simpletree']}, + 'rexml' => + {:builder => HTML5lib::TreeBuilders['rexml'], + :walker => HTML5lib::TreeWalkers['rexml']}, +# 'hpricot' => +# {:builder => HTML5lib::TreeBuilders['hpricot'], +# :walker => HTML5lib::TreeWalkers['hpricot']}, +} + +puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', ' + +class TestTreeWalkers < Test::Unit::TestCase + include HTML5lib::TestSupport + + def concatenateCharacterTokens(tokens) + charactersToken = nil + for token in tokens + type = token[:type] + if [:Characters, :SpaceCharacters].include?(type) + if charactersToken == nil + charactersToken = {:type => :Characters, :data => token[:data]} + else + charactersToken[:data] += token[:data] + end + else + if charactersToken != nil + yield charactersToken + charactersToken = nil + end + yield token + end + end + yield charactersToken if charactersToken != nil + end + + def convertTokens(tokens) + output = [] + indent = 0 + 
concatenateCharacterTokens(tokens) do |token| + case token[:type] + when :StartTag, :EmptyTag + output << "#{' '*indent}<#{token[:name]}>" + indent += 2 + for name, value in token[:data].to_a.sort + next if name=='xmlns' + output << "#{' '*indent}#{name}=\"#{value}\"" + end + indent -= 2 if token[:type] == :EmptyTag + when :EndTag + indent -= 2 + when :Comment + output << "#{' '*indent}<!-- #{token[:data]} -->" + when :Doctype + output << "#{' '*indent}<!DOCTYPE #{token[:name]}>" + when :Characters, :SpaceCharacters + output << "#{' '*indent}\"#{token[:data]}\"" + else + # TODO: what to do with errors? + end + end + return output.join("\n") + end + + html5lib_test_files('tree-construction').each do |test_file| + + test_name = File.basename(test_file).sub('.dat', '') + + File.read(test_file).split("#data\n").each_with_index do |data, index| + next if data.empty? + + innerHTML, input, expected_output, expected_errors = + HTML5lib::TestSupport::parseTestcase(data) + + rexml = $tree_types_to_test['rexml'] + $tree_types_to_test.each do |tree_name, treeClass| + + define_method "test_#{test_name}_#{index}_#{tree_name}" do + + parser = HTML5lib::HTMLParser.new(:tree => treeClass[:builder]) + + if innerHTML + parser.parseFragment(input, innerHTML) + else + parser.parse(input) + end + + document = parser.tree.getDocument + + begin + output = sortattrs(convertTokens(treeClass[:walker].new(document))) + expected = sortattrs(expected_output) + errorMsg = "\n\nExpected:\n#{expected}\nRecieved:\n#{output}\n" + assert_equal(expected, output, errorMsg) + rescue NotImplementedError + # Amnesty for those that confess... 
+ end + end + end + end + end +end diff --git a/vendor/plugins/maruku/lib/maruku/output/to_html.rb b/vendor/plugins/maruku/lib/maruku/output/to_html.rb index b5b5e215..82fa3bbb 100644 --- a/vendor/plugins/maruku/lib/maruku/output/to_html.rb +++ b/vendor/plugins/maruku/lib/maruku/output/to_html.rb @@ -154,6 +154,21 @@ Example: CSS: style.css math.css =end + # Render to an HTML fragment (returns a REXML document tree) + def to_html_tree + div = Element.new 'div' + children_to_html.each do |e| + div << e + end + + # render footnotes + if @doc.footnotes_order.size > 0 + div << render_footnotes + end + + doc = Document.new(nil,{:respect_whitespace =>:all}) + doc << div + end # Render to a complete HTML document (returns a REXML document tree) def to_html_document_tree