diff --git a/vendor/plugins/HTML5lib/lib/html5lib/filters/inject_meta_charset.rb b/vendor/plugins/HTML5lib/lib/html5lib/filters/inject_meta_charset.rb index 294796e2..00dc980d 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/filters/inject_meta_charset.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/filters/inject_meta_charset.rb @@ -20,20 +20,43 @@ module HTML5lib when :EmptyTag if token[:name].downcase == "meta" - if token[:data].any? {|name,value| name=='charset'} - # replace charset with actual encoding - attrs=Hash[*token[:data].flatten] - attrs['charset'] = @encoding - token[:data] = attrs.to_a.sort - meta_found = true + # replace charset with actual encoding + token[:data].each_with_index do |(name,value),index| + if name == 'charset' + token[:data][index][1]=@encoding + meta_found = true + end + end + + # replace charset with actual encoding + has_http_equiv_content_type = false + content_index = -1 + token[:data].each_with_index do |(name,value),i| + if name.downcase == 'charset' + token[:data][i] = ['charset', @encoding] + meta_found = true + break + elsif name == 'http-equiv' and value.downcase == 'content-type' + has_http_equiv_content_type = true + elsif name == 'content' + content_index = i + end + end + + if not meta_found + if has_http_equiv_content_type and content_index >= 0 + token[:data][content_index][1] = + 'text/html; charset=%s' % @encoding + meta_found = true + end end elsif token[:name].downcase == "head" and not meta_found # insert meta into empty head - yield({:type => :StartTag, :name => "head", :data => {}}) - yield({:type => :EmptyTag, :name => "meta", - :data => {"charset" => @encoding}}) - yield({:type => :EndTag, :name => "head"}) + yield(:type => :StartTag, :name => "head", :data => token[:data]) + yield(:type => :EmptyTag, :name => "meta", + :data => [["charset", @encoding]]) + yield(:type => :EndTag, :name => "head") meta_found = true next end @@ -42,8 +65,8 @@ module HTML5lib if token[:name].downcase == "head" and pending.any? # insert meta into head (if necessary) and flush pending queue yield pending.shift - yield({:type => :EmptyTag, :name => "meta", - :data => {"charset" => @encoding}}) if not meta_found + yield(:type => :EmptyTag, :name => "meta", + :data => [["charset", @encoding]]) if not meta_found yield pending.shift while pending.any? meta_found = true state = :post_head diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb b/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb index 4008778b..bf48930a 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb @@ -62,7 +62,8 @@ module HTML5lib @errors = [] @tokenizer = @tokenizer.class unless Class === @tokenizer - @tokenizer = @tokenizer.new(stream, :encoding => encoding, :parseMeta => innerHTML) + @tokenizer = @tokenizer.new(stream, :encoding => encoding, + :parseMeta => !innerHTML) if innerHTML case @innerHTML = container.downcase @@ -99,10 +100,13 @@ module HTML5lib case token[:type] when :Characters, :SpaceCharacters, :Comment @phase.send method, token[:data] - when :StartTag, :Doctype + when :StartTag @phase.send method, token[:name], token[:data] when :EndTag @phase.send method, token[:name] + when :Doctype + @phase.send method, token[:name], token[:publicId], + token[:systemId], token[:correct] else parseError(token[:data]) end @@ -147,10 +151,6 @@ module HTML5lib raise ParseError if @strict end - # This error is not an error - def atheistParseError - end - # HTML5 specific normalizations to the token stream def normalizeToken(token) @@ -160,9 +160,7 @@ module HTML5lib # element. If it matches a void element atheists did the wrong # thing and if it doesn't it's wrong for everyone. - if VOID_ELEMENTS.include?(token[:name]) - atheistParseError - else + unless VOID_ELEMENTS.include?(token[:name]) parseError(_('Solidus (/) incorrectly placed in tag.')) end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/before_head_phase.rb b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/before_head_phase.rb index 87b301a2..98a9d023 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/before_head_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/before_head_phase.rb @@ -5,7 +5,7 @@ module HTML5lib handle_start 'html', 'head' - handle_end 'html' + handle_end %w( html head body br ) => 'ImplyHead' def processEOF startTagHead('head', {}) @@ -28,7 +28,7 @@ module HTML5lib @parser.phase.processStartTag(name, attributes) end - def endTagHtml(name) + def endTagImplyHead(name) startTagHead('head', {}) @parser.phase.processEndTag(name) end @@ -38,4 +38,4 @@ module HTML5lib end end -end \ No newline at end of file +end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_body_phase.rb b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_body_phase.rb index a00eb291..57720292 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_body_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_body_phase.rb @@ -5,15 +5,20 @@ module HTML5lib # http://www.whatwg.org/specs/web-apps/current-work/#in-body - handle_start 'html', 'body', 'form', 'plaintext', 'a', 'button', 'xmp', 'table', 'hr', 'image' + handle_start 'html' + handle_start %w( base link meta script style ) => 'ProcessInHead' + handle_start 'title' - handle_start 'input', 'textarea', 'select', 'isindex', %w( script style ), %w( marquee object ) + handle_start 'body', 'form', 'plaintext', 'a', 'button', 'xmp', 'table', 'hr', 'image' - handle_start %w( li dd dt ) => 'ListItem', %w( base link meta title ) => 'FromHead' + handle_start 'input', 'textarea', 'select', 'isindex', %w( marquee object ) + + handle_start %w( li dd dt ) => 'ListItem' handle_start %w( address blockquote center dir div dl fieldset listing menu ol p pre ul ) => 'CloseP' - handle_start %w( b big em font i nobr s small strike strong tt u ) => 'Formatting' + handle_start %w( b big em font i s small strike strong tt u ) => 'Formatting' + handle_start 'nobr' handle_start %w( area basefont bgsound br embed img param spacer wbr ) => 'VoidFormatting' @@ -33,7 +38,9 @@ module HTML5lib handle_end %w( head frameset select optgroup option table caption colgroup col thead tfoot tbody tr td th ) => 'Misplaced' - handle_end %w( area basefont bgsound br embed hr image img input isindex param spacer wbr frame ) => 'None' + handle_end 'br' + + handle_end %w( area basefont bgsound embed hr image img input isindex param spacer wbr frame ) => 'None' handle_end %w( noframes noscript noembed textarea xmp iframe ) => 'CdataTextAreaXmp' @@ -73,11 +80,11 @@ module HTML5lib @tree.insertText(data) end - def startTagScriptStyle(name, attributes) + def startTagProcessInHead(name, attributes) @parser.phases[:inHead].processStartTag(name, attributes) end - def startTagFromHead(name, attributes) + def startTagTitle(name, attributes) @parser.parseError(_("Unexpected start tag (#{name}) that belongs in the head. Moved.")) @parser.phases[:inHead].processStartTag(name, attributes) end @@ -120,7 +127,12 @@ module HTML5lib @tree.openElements.reverse.each_with_index do |node, i| if stopName.include?(node.name) - (i + 1).times { @tree.openElements.pop } + poppedNodes = (0..i).collect { @tree.openElements.pop } + if i >= 1 + @parser.parseError("Missing end tag%s (%s)" % [ + (i>1 ? 's' : ''), + poppedNodes.reverse.map {|item| item.name}.join(', ')]) + end break end @@ -142,15 +154,19 @@ module HTML5lib def startTagHeading(name, attributes) endTagP('p') if in_scope?('p') - HEADING_ELEMENTS.each do |element| - if in_scope?(element) - @parser.parseError(_("Unexpected start tag (#{name}).")) - - remove_open_elements_until { |element| HEADING_ELEMENTS.include?(element.name) } - break - end - end + # Uncomment the following for IE7 behavior: + # HEADING_ELEMENTS.each do |element| + # if in_scope?(element) + # @parser.parseError(_("Unexpected start tag (#{name}).")) + # + # remove_open_elements_until do |element| + # HEADING_ELEMENTS.include?(element.name) + # end + # + # break + # end + # end @tree.insertElement(name, attributes) end @@ -170,6 +186,12 @@ module HTML5lib addFormattingElement(name, attributes) end + def startTagNobr(name, attributes) + @tree.reconstructActiveFormattingElements + processEndTag('nobr') if in_scope?('nobr') + addFormattingElement(name, attributes) + end + def startTagButton(name, attributes) if in_scope?('button') @parser.parseError(_('Unexpected start tag (button) implied end tag (button).')) @@ -497,6 +519,13 @@ module HTML5lib @parser.parseError(_("Unexpected end tag (#{name}). Ignored.")) end + def endTagBr(name) + @parser.parseError(_("Unexpected end tag (br). Treated as br element.")) + @tree.reconstructActiveFormattingElements + @tree.insertElement(name, {}) + @tree.openElements.pop() + end + def endTagNone(name) # This handles elements with no end tag. @parser.parseError(_("This tag (#{name}) has no end tag")) diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_head_phase.rb b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_head_phase.rb index 4060114a..20b37653 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_head_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_head_phase.rb @@ -5,7 +5,9 @@ module HTML5lib handle_start 'html', 'head', 'title', 'style', 'script', %w( base link meta ) - handle_end 'head', 'html', %w( title style script ) + handle_end 'head' + handle_end %w( html body br ) => 'ImplyAfterHead' + handle_end %w( title style script ) def processEOF if ['title', 'style', 'script'].include?(name = @tree.openElements[-1].name) @@ -63,7 +65,11 @@ module HTML5lib def startTagBaseLinkMeta(name, attributes) element = @tree.createElement(name, attributes) - appendToHead(element) + if @tree.headPointer != nil and @parser.phase == @parser.phases[:inHead] + appendToHead(element) + else + @tree.openElements[-1].appendChild(element) + end end def startTagOther(name, attributes) @@ -80,7 +86,7 @@ module HTML5lib @parser.phase = @parser.phases[:afterHead] end - def endTagHtml(name) + def endTagImplyAfterHead(name) anythingElse @parser.phase.processEndTag(name) end @@ -117,4 +123,4 @@ module HTML5lib end end -end \ No newline at end of file +end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb index 808ac03c..be38c53e 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_table_phase.rb @@ -89,10 +89,10 @@ module HTML5lib def endTagOther(name) @parser.parseError(_("Unexpected end tag (#{name}) in table context caused voodoo mode.")) # Make all the special element rearranging voodoo kick in - @parser.insertFromTable = true + @tree.insertFromTable = true # Process the end tag in the "in body" mode @parser.phases[:inBody].processEndTag(name) - @parser.insertFromTable = false + @tree.insertFromTable = false end protected diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/initial_phase.rb b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/initial_phase.rb index 9914543b..aeb0afdd 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/initial_phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/initial_phase.rb @@ -17,9 +17,95 @@ module HTML5lib @tree.insertComment(data, @tree.document) end - def processDoctype(name, error) - @parser.parseError(_('Erroneous DOCTYPE.')) if error + def processDoctype(name, publicId, systemId, correct) + if name.downcase != 'html' or publicId or systemId + @parser.parseError(_('Erroneous DOCTYPE.')) + end + # XXX need to update DOCTYPE tokens @tree.insertDoctype(name) + + publicId = publicId.to_s.upcase + + if name.downcase != 'html' + # XXX quirks mode + else + if ["+//silmaril//dtd html pro v0r11 19970101//en", + "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en", + "-//as//dtd html 3.0 aswedit + extensions//en", + "-//ietf//dtd html 2.0 level 1//en", + "-//ietf//dtd html 2.0 level 2//en", + "-//ietf//dtd html 2.0 strict level 1//en", + "-//ietf//dtd html 2.0 strict level 2//en", + "-//ietf//dtd html 2.0 strict//en", + "-//ietf//dtd html 2.0//en", + "-//ietf//dtd html 2.1e//en", + "-//ietf//dtd html 3.0//en", + "-//ietf//dtd html 3.0//en//", + "-//ietf//dtd html 3.2 final//en", + "-//ietf//dtd html 3.2//en", + "-//ietf//dtd html 3//en", + "-//ietf//dtd html level 0//en", + "-//ietf//dtd html level 0//en//2.0", + "-//ietf//dtd html level 1//en", + "-//ietf//dtd html level 1//en//2.0", + "-//ietf//dtd html level 2//en", + "-//ietf//dtd html level 2//en//2.0", + "-//ietf//dtd html level 3//en", + "-//ietf//dtd html level 3//en//3.0", + "-//ietf//dtd html strict level 0//en", + "-//ietf//dtd html strict level 0//en//2.0", + "-//ietf//dtd html strict level 1//en", + "-//ietf//dtd html strict level 1//en//2.0", + "-//ietf//dtd html strict level 2//en", + "-//ietf//dtd html strict level 2//en//2.0", + "-//ietf//dtd html strict level 3//en", + "-//ietf//dtd html strict level 3//en//3.0", + "-//ietf//dtd html strict//en", + "-//ietf//dtd html strict//en//2.0", + "-//ietf//dtd html strict//en//3.0", + "-//ietf//dtd html//en", + "-//ietf//dtd html//en//2.0", + "-//ietf//dtd html//en//3.0", + "-//metrius//dtd metrius presentational//en", + "-//microsoft//dtd internet explorer 2.0 html strict//en", + "-//microsoft//dtd internet explorer 2.0 html//en", + "-//microsoft//dtd internet explorer 2.0 tables//en", + "-//microsoft//dtd internet explorer 3.0 html strict//en", + "-//microsoft//dtd internet explorer 3.0 html//en", + "-//microsoft//dtd internet explorer 3.0 tables//en", + "-//netscape comm. corp.//dtd html//en", + "-//netscape comm. corp.//dtd strict html//en", + "-//o'reilly and associates//dtd html 2.0//en", + "-//o'reilly and associates//dtd html extended 1.0//en", + "-//spyglass//dtd html 2.0 extended//en", + "-//sq//dtd html 2.0 hotmetal + extensions//en", + "-//sun microsystems corp.//dtd hotjava html//en", + "-//sun microsystems corp.//dtd hotjava strict html//en", + "-//w3c//dtd html 3 1995-03-24//en", + "-//w3c//dtd html 3.2 draft//en", + "-//w3c//dtd html 3.2 final//en", + "-//w3c//dtd html 3.2//en", + "-//w3c//dtd html 3.2s draft//en", + "-//w3c//dtd html 4.0 frameset//en", + "-//w3c//dtd html 4.0 transitional//en", + "-//w3c//dtd html experimental 19960712//en", + "-//w3c//dtd html experimental 970421//en", + "-//w3c//dtd w3 html//en", + "-//w3o//dtd w3 html 3.0//en", + "-//w3o//dtd w3 html 3.0//en//", + "-//w3o//dtd w3 html strict 3.0//en//", + "-//webtechs//dtd mozilla html 2.0//en", + "-//webtechs//dtd mozilla html//en", + "-/w3c/dtd html 4.0 transitional/en", + "html"].include?(publicId) or + (systemId == nil and + ["-//w3c//dtd html 4.01 frameset//EN", + "-//w3c//dtd html 4.01 transitional//EN"].include?(publicId)) or + (systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") + #XXX quirks mode + end + end + @parser.phase = @parser.phases[:rootElement] end @@ -46,4 +132,4 @@ module HTML5lib end end -end \ No newline at end of file +end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb index 6a271504..d451eb37 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/phase.rb @@ -101,7 +101,7 @@ module HTML5lib @tree.insertComment(data, @tree.openElements[-1]) end - def processDoctype(name, error) + def processDoctype(name, publicId, systemId, correct) @parser.parseError(_('Unexpected DOCTYPE. Ignored.')) end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb b/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb index 1436e3bb..3abb5b67 100755 --- a/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb @@ -33,9 +33,6 @@ module HTML5lib options.each { |name, value| instance_variable_set("@#{name}", value) } - # List of where new lines occur - @new_lines = [0] - # Raw Stream @raw_stream = open_stream(source) @@ -77,6 +74,8 @@ module HTML5lib # Reset position in the list to read from @tell = 0 + @line = @col = 0 + @line_lengths = [] end # Produces a file object from source. @@ -112,7 +111,7 @@ module HTML5lib require 'UniversalDetector' # gem install chardet buffer = @raw_stream.read encoding = UniversalDetector::chardet(buffer)['encoding'] - @raw_stream = open_stream(buffer) + seek(buffer, 0) rescue LoadError end end @@ -122,7 +121,7 @@ module HTML5lib encoding = @DEFAULT_ENCODING end - #Substitute for equivalent encodings: + #Substitute for equivalent encodings encoding_sub = {'iso-8859-1' => 'windows-1252'} if encoding_sub.has_key?(encoding.downcase) @@ -145,7 +144,6 @@ module HTML5lib } # Go to beginning of file and read in 4 bytes - @raw_stream.seek(0) string = @raw_stream.read(4) return nil unless string @@ -162,30 +160,80 @@ module HTML5lib end end - #AT - move this to the caller? # Set the read position past the BOM if one was found, otherwise # set it to the start of the stream - @raw_stream.seek(encoding ? seek : 0) + seek(string, encoding ? seek : 0) return encoding end + def seek(buffer, n) + if @raw_stream.respond_to?(:unget) + @raw_stream.unget(buffer[n..-1]) + return + end + + if @raw_stream.respond_to?(:seek) + begin + @raw_stream.seek(n) + return + rescue Errno::ESPIPE + end + end + + require 'delegate' + @raw_stream = SimpleDelegator.new(@raw_stream) + + class << @raw_stream + def read(chars=-1) + if chars == -1 or chars > @data.length + result = @data + @data = '' + return result if __getobj__.eof? + return result + __getobj__.read if chars == -1 + return result + __getobj__.read(chars-result.length) + elsif @data.empty? + return __getobj__.read(chars) + else + result = @data[1...chars] + @data = @data[chars..-1] + return result + end + end + + def unget(data) + if !@data or @data.empty? + @data = data + else + @data += data + end + end + end + + @raw_stream.unget(buffer[n .. -1]) + end + # Report the encoding declared by the meta element def detect_encoding_meta - parser = EncodingParser.new(@raw_stream.read(@NUM_BYTES_META)) - @raw_stream.seek(0) + buffer = @raw_stream.read(@NUM_BYTES_META) + parser = EncodingParser.new(buffer) + seek(buffer, 0) return parser.get_encoding end # Returns (line, col) of the current position in the stream. def position - line = 0 - @new_lines.each do |pos| - break unless pos < @tell - line += 1 + line, col = @line, @col + @queue.reverse.each do |c| + if c == "\n" + line -= 1 + raise RuntimeError.new("col=#{col}") unless col == 0 + col = @line_lengths[line] + else + col -= 1 + end end - col = @tell - @new_lines[line-1] - 1 - return [line, col] + return [line+1, col] end # Read one character from the stream or queue if available. Return @@ -205,9 +253,14 @@ module HTML5lib c = 0x0A end - # record where newlines occur so that the position method - # can tell where it is - @new_lines << @tell-1 if c == 0x0A + # update position in stream + if c == 0x0a + @line_lengths << @col + @line += 1 + @col = 0 + else + @col += 1 + end c.chr @@ -261,11 +314,7 @@ module HTML5lib # Put the character stopped on back to the front of the queue # from where it came. c = char_stack.pop - if c == :EOF or @data_stream[@tell-1] == c[0] - @tell -= 1 - else - @queue.insert(0, c) - end + @queue.insert(0, c) unless c == :EOF return char_stack.join('') end end @@ -454,7 +503,7 @@ module HTML5lib space_found = false #Step 5 attribute name while true - if @data.current_byte == '=' and attr_name: + if @data.current_byte == '=' and attr_name break elsif SPACE_CHARACTERS.include?(@data.current_byte) space_found = true diff --git a/vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb b/vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb index 5410b98e..bbcf0eac 100755 --- a/vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb @@ -69,15 +69,22 @@ module HTML5lib # ensure that non-void XHTML elements have content so that separate # open and close tags are emitted - if token[:type] == :EndTag and \ - not VOID_ELEMENTS.include? token[:name] and \ - token[:name] == @tree.openElements[-1].name and \ - not @tree.openElements[-1].hasContent - @tree.insertText('') unless - @tree.openElements.any? {|e| - e.attributes.keys.include? 'xmlns' and - e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml' - } + if token[:type] == :EndTag + if VOID_ELEMENTS.include? token[:name] + if @tree.openElements[-1].name != token["name"]: + token[:type] = :EmptyTag + token["data"] ||= {} + end + else + if token[:name] == @tree.openElements[-1].name and \ + not @tree.openElements[-1].hasContent + @tree.insertText('') unless + @tree.openElements.any? {|e| + e.attributes.keys.include? 'xmlns' and + e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml' + } + end + end end return token diff --git a/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb b/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb index 8fe95ed2..cd4c66a6 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb @@ -1,178 +1,2 @@ -require 'html5lib/constants' - -module HTML5lib - - class HTMLSerializer - CDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript] - - def self.serialize(stream, options = {}) - new(options).serialize(stream, options[:encoding]) - end - - def initialize(options={}) - @quote_attr_values = false - @quote_char = '"' - @use_best_quote_char = true - @minimize_boolean_attributes = true - - @use_trailing_solidus = false - @space_before_trailing_solidus = true - - @omit_optional_tags = true - @sanitize = false - - @strip_whitespace = false - - @inject_meta_charset = true - - options.each do |name, value| - next unless %w(quote_attr_values quote_char use_best_quote_char - minimize_boolean_attributes use_trailing_solidus - space_before_trailing_solidus omit_optional_tags sanitize - strip_whitespace inject_meta_charset).include? name.to_s - @use_best_quote_char = false if name.to_s == 'quote_char' - instance_variable_set("@#{name}", value) - end - - @errors = [] - end - - def serialize(treewalker, encoding=nil) - in_cdata = false - @errors = [] - - if encoding and @inject_meta_charset - require 'html5lib/filters/inject_meta_charset' - treewalker = Filters::InjectMetaCharset.new(treewalker, encoding) - end - - if @strip_whitespace - require 'html5lib/filters/whitespace' - treewalker = Filters::WhitespaceFilter.new(treewalker) - end - - if @sanitize - require 'html5lib/filters/sanitizer' - treewalker = Filters::HTMLSanitizeFilter.new(treewalker) - end - - if @omit_optional_tags - require 'html5lib/filters/optionaltags' - treewalker = Filters::OptionalTagFilter.new(treewalker) - end - - result = [] - treewalker.each do |token| - type = token[:type] - if type == :Doctype - doctype = "" % token[:name] - result << doctype - - elsif [:Characters, :SpaceCharacters].include? type - if type == :SpaceCharacters or in_cdata - if in_cdata and token[:data].include?("") - serializeError(_("Unexpected in CDATA")) - end - result << token[:data] - else - result << token[:data]. - gsub("&", "&"). - gsub("<", "<"). - gsub(">", ">") - end - - elsif [:StartTag, :EmptyTag].include? type - name = token[:name] - if CDATA_ELEMENTS.include?(name) - in_cdata = true - elsif in_cdata - serializeError(_("Unexpected child element of a CDATA element")) - end - attributes = [] - for k,v in attrs = token[:data].to_a.sort - attributes << ' ' - - attributes << k - if not @minimize_boolean_attributes or \ - (!(BOOLEAN_ATTRIBUTES[name]||[]).include?(k) \ - and !BOOLEAN_ATTRIBUTES[:global].include?(k)) - attributes << "=" - if @quote_attr_values or v.empty? - quote_attr = true - else - quote_attr = (SPACE_CHARACTERS + %w(< > " ')).any? {|c| v.include?(c)} - end - v = v.gsub("&", "&") - if quote_attr - quote_char = @quote_char - if @use_best_quote_char - if v.index("'") and !v.index('"') - quote_char = '"' - elsif v.index('"') and !v.index("'") - quote_char = "'" - end - end - if quote_char == "'" - v = v.gsub("'", "'") - else - v = v.gsub('"', """) - end - attributes << quote_char << v << quote_char - else - attributes << v - end - end - end - if VOID_ELEMENTS.include?(name) and @use_trailing_solidus - if @space_before_trailing_solidus - attributes << " /" - else - attributes << "/" - end - end - result << "<%s%s>" % [name, attributes.join('')] - - elsif type == :EndTag - name = token[:name] - if CDATA_ELEMENTS.include?(name) - in_cdata = false - elsif in_cdata - serializeError(_("Unexpected child element of a CDATA element")) - end - end_tag = "#{name}>" - result << end_tag - - elsif type == :Comment - data = token[:data] - serializeError(_("Comment contains --")) if data.index("--") - comment = "" % token[:data] - result << comment - - else - serializeError(token[:data]) - end - end - - if encoding and encoding != 'utf-8' - require 'iconv' - Iconv.iconv(encoding, 'utf-8', result.join('')).first - else - result.join('') - end - end - - alias :render :serialize - - def serializeError(data="XXX ERROR MESSAGE NEEDED") - # XXX The idea is to make data mandatory. - @errors.push(data) - if @strict - raise SerializeError - end - end - end - - # Error in serialized tree - class SerializeError < Exception - end -end +require 'html5lib/serializer/htmlserializer' +require 'html5lib/serializer/xhtmlserializer' diff --git a/vendor/plugins/HTML5lib/lib/html5lib/serializer/htmlserializer.rb b/vendor/plugins/HTML5lib/lib/html5lib/serializer/htmlserializer.rb new file mode 100644 index 00000000..a03b7d79 --- /dev/null +++ b/vendor/plugins/HTML5lib/lib/html5lib/serializer/htmlserializer.rb @@ -0,0 +1,177 @@ +require 'html5lib/constants' + +module HTML5lib + + class HTMLSerializer + + def self.serialize(stream, options = {}) + new(options).serialize(stream, options[:encoding]) + end + + def escape(string) + string.gsub("&", "&").gsub("<", "<").gsub(">", ">") + end + + def initialize(options={}) + @quote_attr_values = false + @quote_char = '"' + @use_best_quote_char = true + @minimize_boolean_attributes = true + + @use_trailing_solidus = false + @space_before_trailing_solidus = true + @escape_lt_in_attrs = false + + @omit_optional_tags = true + @sanitize = false + + @strip_whitespace = false + + @inject_meta_charset = true + + options.each do |name, value| + next unless instance_variables.include?("@#{name}") + @use_best_quote_char = false if name.to_s == 'quote_char' + instance_variable_set("@#{name}", value) + end + + @errors = [] + end + + def serialize(treewalker, encoding=nil) + in_cdata = false + @errors = [] + + if encoding and @inject_meta_charset + require 'html5lib/filters/inject_meta_charset' + treewalker = Filters::InjectMetaCharset.new(treewalker, encoding) + end + + if @strip_whitespace + require 'html5lib/filters/whitespace' + treewalker = Filters::WhitespaceFilter.new(treewalker) + end + + if @sanitize + require 'html5lib/filters/sanitizer' + treewalker = Filters::HTMLSanitizeFilter.new(treewalker) + end + + if @omit_optional_tags + require 'html5lib/filters/optionaltags' + treewalker = Filters::OptionalTagFilter.new(treewalker) + end + + result = [] + treewalker.each do |token| + type = token[:type] + if type == :Doctype + doctype = "" % token[:name] + result << doctype + + elsif [:Characters, :SpaceCharacters].include? type + if type == :SpaceCharacters or in_cdata + if in_cdata and token[:data].include?("") + serializeError(_("Unexpected in CDATA")) + end + result << token[:data] + else + result << escape(token[:data]) + end + + elsif [:StartTag, :EmptyTag].include? type + name = token[:name] + if RCDATA_ELEMENTS.include?(name) + in_cdata = true + elsif in_cdata + serializeError(_("Unexpected child element of a CDATA element")) + end + attributes = [] + for k,v in attrs = token[:data].to_a.sort + attributes << ' ' + + attributes << k + if not @minimize_boolean_attributes or \ + (!(BOOLEAN_ATTRIBUTES[name]||[]).include?(k) \ + and !BOOLEAN_ATTRIBUTES[:global].include?(k)) + attributes << "=" + if @quote_attr_values or v.empty? + quote_attr = true + else + quote_attr = (SPACE_CHARACTERS + %w(< > " ')).any? {|c| v.include?(c)} + end + v = v.gsub("&", "&") + v = v.gsub("<", "<") if @escape_lt_in_attrs + if quote_attr + quote_char = @quote_char + if @use_best_quote_char + if v.index("'") and !v.index('"') + quote_char = '"' + elsif v.index('"') and !v.index("'") + quote_char = "'" + end + end + if quote_char == "'" + v = v.gsub("'", "'") + else + v = v.gsub('"', """) + end + attributes << quote_char << v << quote_char + else + attributes << v + end + end + end + if VOID_ELEMENTS.include?(name) and @use_trailing_solidus + if @space_before_trailing_solidus + attributes << " /" + else + attributes << "/" + end + end + result << "<%s%s>" % [name, attributes.join('')] + + elsif type == :EndTag + name = token[:name] + if RCDATA_ELEMENTS.include?(name) + in_cdata = false + elsif in_cdata + serializeError(_("Unexpected child element of a CDATA element")) + end + end_tag = "#{name}>" + result << end_tag + + elsif type == :Comment + data = token[:data] + serializeError(_("Comment contains --")) if data.index("--") + comment = "" % token[:data] + result << comment + + else + serializeError(token[:data]) + end + end + + if encoding and encoding != 'utf-8' + require 'iconv' + Iconv.iconv(encoding, 'utf-8', result.join('')).first + else + result.join('') + end + end + + alias :render :serialize + + def serializeError(data="XXX ERROR MESSAGE NEEDED") + # XXX The idea is to make data mandatory. + @errors.push(data) + if @strict + raise SerializeError + end + end + end + + # Error in serialized tree + class SerializeError < Exception + end +end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/serializer/xhtmlserializer.rb b/vendor/plugins/HTML5lib/lib/html5lib/serializer/xhtmlserializer.rb new file mode 100644 index 00000000..43a63788 --- /dev/null +++ b/vendor/plugins/HTML5lib/lib/html5lib/serializer/xhtmlserializer.rb @@ -0,0 +1,19 @@ +require 'html5lib/serializer/htmlserializer' + +module HTML5lib + + class XHTMLSerializer < HTMLSerializer + DEFAULTS = { + :quote_attr_values => true, + :minimize_boolean_attributes => false, + :use_trailing_solidus => true, + :escape_lt_in_attrs => true, + :omit_optional_tags => false + } + + def initialize(options={}) + super(DEFAULTS.clone.update(options)) + end + end + +end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb b/vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb index bd594e07..6519944d 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb @@ -41,19 +41,31 @@ module HTML5lib :attributeValueUnQuoted => :attributeValueUnQuotedState, :bogusComment => :bogusCommentState, :markupDeclarationOpen => :markupDeclarationOpenState, + :commentStart => :commentStartState, + :commentStartDash => :commentStartDashState, :comment => :commentState, - :commentDash => :commentDashState, + :commentEndDash => :commentEndDashState, :commentEnd => :commentEndState, :doctype => :doctypeState, :beforeDoctypeName => :beforeDoctypeNameState, :doctypeName => :doctypeNameState, :afterDoctypeName => :afterDoctypeNameState, + :beforeDoctypePublicIdentifier => :beforeDoctypePublicIdentifierState, + :doctypePublicIdentifierDoubleQuoted => :doctypePublicIdentifierDoubleQuotedState, + :doctypePublicIdentifierSingleQuoted => :doctypePublicIdentifierSingleQuotedState, + :afterDoctypePublicIdentifier => :afterDoctypePublicIdentifierState, + :beforeDoctypeSystemIdentifier => :beforeDoctypeSystemIdentifierState, + :doctypeSystemIdentifierDoubleQuoted => :doctypeSystemIdentifierDoubleQuotedState, + :doctypeSystemIdentifierSingleQuoted => :doctypeSystemIdentifierSingleQuotedState, + :afterDoctypeSystemIdentifier => :afterDoctypeSystemIdentifierState, :bogusDoctype => :bogusDoctypeState } # Setup the initial tokenizer state @contentModelFlag = :PCDATA @state = @states[:data] + @escapeFlag = false + @lastFourChars = [] # The current token being created @currentToken = nil @@ -133,24 +145,14 @@ module HTML5lib # If the integer is between 127 and 160 (so 128 and bigger and 159 and # smaller) we need to do the "windows trick". if (127...160).include? charAsInt - #XXX - removed parse error from windows 1252 entity for now - #we may want to reenable this later - #@tokenQueue.push({:type => :ParseError, :data => - # _("Entity used with illegal number (windows-1252 reference).")}) + @tokenQueue.push({:type => :ParseError, :data => + _("Entity used with illegal number (windows-1252 reference).")}) charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128] end - # 0 is not a good number. - if charAsInt == 0 - charAsInt = 65533 - end - - if charAsInt <= 0x10FFFF + if charAsInt > 0 and charAsInt <= 1114111 char = [charAsInt].pack('U') - else - @tokenQueue.push({:type => :ParseError, :data => - _("Numeric entity couldn't be converted to character.")}) end # Discard the ; if present. Otherwise, put it back on the queue and @@ -167,7 +169,10 @@ module HTML5lib def consumeEntity char = nil charStack = [@stream.char] - if charStack[0] == "#" + if SPACE_CHARACTERS.include?(charStack[0]) or + [:EOF, '<', '&'].include?(charStack[0]) + @stream.queue+= charStack + elsif charStack[0] == "#" # We might have a number entity here. charStack += [@stream.char, @stream.char] if charStack.include? :EOF @@ -194,10 +199,6 @@ module HTML5lib _("Numeric entity expected but none found.")}) end end - # Break out if we reach the end of the file - elsif charStack[0] == :EOF - @tokenQueue.push({:type => :ParseError, :data => - _("Entity expected. Got end of file instead.")}) else # At this point in the process might have named entity. Entities # are stored in the global variable "entities". @@ -267,14 +268,33 @@ module HTML5lib # statements should be. def dataState data = @stream.char - if data == "&" and (@contentModelFlag == :PCDATA or - @contentModelFlag == :RCDATA) + + if @contentModelFlag == :CDATA or @contentModelFlag == :RCDATA + @lastFourChars << data + @lastFourChars.shift if @lastFourChars.length > 4 + end + + if data == "&" and [:PCDATA,:RCDATA].include?(@contentModelFlag) @state = @states[:entityData] - elsif data == "<" and @contentModelFlag != :PLAINTEXT - @state = @states[:tagOpen] + + elsif data == "-" and [:CDATA,:RCDATA].include?(@contentModelFlag) and + @escapeFlag == false and @lastFourChars.join('') == "" + @escapeFlag = false + @tokenQueue.push({:type => :Characters, :data => data}) + elsif data == :EOF # Tokenization ends. return false + elsif SPACE_CHARACTERS.include? data # Directly after emitting a token you switch back to the "data # state". At that point SPACE_CHARACTERS are important so they are @@ -285,7 +305,7 @@ module HTML5lib data + @stream.chars_until(SPACE_CHARACTERS, true)}) else @tokenQueue.push({:type => :Characters, :data => - data + @stream.chars_until(["&", "<"])}) + data + @stream.chars_until(%w[& < > -])}) end return true end @@ -380,8 +400,6 @@ module HTML5lib # emitting the end tag token. @contentModelFlag = :PCDATA else - @tokenQueue.push({:type => :ParseError, :data => - _("Expected closing tag after seeing ''. None found.")}) @tokenQueue.push({:type => :Characters, :data => ""}) @state = @states[:data] @@ -391,29 +409,27 @@ module HTML5lib end end - if @contentModelFlag == :PCDATA - data = @stream.char - if data == :EOF - @tokenQueue.push({:type => :ParseError, :data => - _("Expected closing tag. Unexpected end of file.")}) - @tokenQueue.push({:type => :Characters, :data => ""}) - @state = @states[:data] - elsif ASCII_LETTERS.include? data - @currentToken =\ - {:type => :EndTag, :name => data, :data => []} - @state = @states[:tagName] - elsif data == ">" - @tokenQueue.push({:type => :ParseError, :data => - _("Expected closing tag. Got '>' instead. Ignoring '>'.")}) - @state = @states[:data] - else - # XXX data can be _'_... - @tokenQueue.push({:type => :ParseError, :data => - _("Expected closing tag. Unexpected character '" + data + "' found.")}) - @stream.queue.push(data) - @state = @states[:bogusComment] - end + data = @stream.char + if data == :EOF + @tokenQueue.push({:type => :ParseError, :data => + _("Expected closing tag. Unexpected end of file.")}) + @tokenQueue.push({:type => :Characters, :data => ""}) + @state = @states[:data] + elsif ASCII_LETTERS.include? data + @currentToken = {:type => :EndTag, :name => data, :data => []} + @state = @states[:tagName] + elsif data == ">" + @tokenQueue.push({:type => :ParseError, :data => + _("Expected closing tag. Got '>' instead. Ignoring '>'.")}) + @state = @states[:data] + else + # XXX data can be _'_... + @tokenQueue.push({:type => :ParseError, :data => + _("Expected closing tag. Unexpected character '#{data}' found.")}) + @stream.queue.push(data) + @state = @states[:bogusComment] end + return true end @@ -430,11 +446,6 @@ module HTML5lib @stream.chars_until(ASCII_LETTERS, true) elsif data == ">" emitCurrentToken - elsif data == "<" - @stream.queue.push(data) - @tokenQueue.push({:type => :ParseError, :data => - _("Unexpected < character when getting the tag name.")}) - emitCurrentToken elsif data == "/" processSolidusInTag @state = @states[:beforeAttributeName] @@ -459,11 +470,6 @@ module HTML5lib emitCurrentToken elsif data == "/" processSolidusInTag - elsif data == "<" - @stream.queue.push(data) - @tokenQueue.push({:type => :ParseError, :data => - _("Unexpected < character. Expected attribute name instead.")}) - emitCurrentToken else @currentToken[:data].push([data, ""]) @state = @states[:attributeName] @@ -494,12 +500,6 @@ module HTML5lib elsif data == "/" processSolidusInTag @state = @states[:beforeAttributeName] - elsif data == "<" - @stream.queue.push(data) - @tokenQueue.push({:type => :ParseError, :data => - _("Unexpected < character in attribute name.")}) - emitCurrentToken - leavingThisState = false else @currentToken[:data][-1][0] += data leavingThisState = false @@ -537,11 +537,6 @@ module HTML5lib elsif data == "/" processSolidusInTag @state = @states[:beforeAttributeName] - elsif data == "<" - @stream.queue.push(data) - @tokenQueue.push({:type => :ParseError, :data => - _("Unexpected < character. Expected = or end of tag.")}) - emitCurrentToken elsif data == :EOF @tokenQueue.push({:type => :ParseError, :data => _("Unexpected end of file. Expected = or end of tag.")}) @@ -566,11 +561,6 @@ module HTML5lib @state = @states[:attributeValueSingleQuoted] elsif data == ">" emitCurrentToken - elsif data == "<" - @stream.queue.push(data) - @tokenQueue.push({:type => :ParseError, :data => - _("Unexpected < character. Expected attribute value.")}) - emitCurrentToken elsif data == :EOF @tokenQueue.push({:type => :ParseError, :data => _("Unexpected end of file. Expected attribute value.")}) @@ -624,11 +614,6 @@ module HTML5lib processEntityInAttribute elsif data == ">" emitCurrentToken - elsif data == "<" - @stream.queue.push(data) - @tokenQueue.push({:type => :ParseError, :data => - _("Unexpected < character in attribute value.")}) - emitCurrentToken elsif data == :EOF @tokenQueue.push({:type => :ParseError, :data => _("Unexpected end of file in attribute value.")}) @@ -658,14 +643,15 @@ module HTML5lib charStack = [@stream.char, @stream.char] if charStack == ["-", "-"] @currentToken = {:type => :Comment, :data => ""} - @state = @states[:comment] + @state = @states[:commentStart] else 5.times { charStack.push(@stream.char) } # Put in explicit :EOF check if ((not charStack.include? :EOF) and charStack.join("").upcase == "DOCTYPE") @currentToken =\ - {:type => :Doctype, :name => "", :data => true} + {:type => :Doctype, :name => "", + :publicId => nil, :systemId => nil, :correct => true} @state = @states[:doctype] else @tokenQueue.push({:type => :ParseError, :data => @@ -677,10 +663,52 @@ module HTML5lib return true end + def commentStartState + data = @stream.char + if data == "-" + @state = @states[:commentStartDash] + elsif data == ">" + @tokenQueue.push({:type => :ParseError, :data => + _("Incorrect comment.")}) + @tokenQueue.push(@currentToken) + @state = @states[:data] + elsif data == EOF + @tokenQueue.push({:type => :ParseError, :data => + _("Unexpected end of file in comment.")}) + @tokenQueue.push(@currentToken) + @state = @states[:data] + else + @currentToken[:data] += data + @stream.chars_until("-") + @state = @states[:comment] + end + return true + end + + def commentStartDashState + data = @stream.char + if data == "-" + @state = @states[:commentEnd] + elsif data == ">" + @tokenQueue.push({:type => :ParseError, :data => + _("Incorrect comment.")}) + @tokenQueue.push(@currentToken) + @state = @states[:data] + elsif data == EOF + @tokenQueue.push({:type => :ParseError, :data => + _("Unexpected end of file in comment.")}) + @tokenQueue.push(@currentToken) + @state = @states[:data] + else + @currentToken[:data] += data + @stream.chars_until("-") + @state = @states[:comment] + end + return true + end + def commentState data = @stream.char if data == "-" - @state = @states[:commentDash] + @state = @states[:commentEndDash] elsif data == :EOF @tokenQueue.push({:type => :ParseError, :data => _("Unexpected end of file in comment.")}) @@ -692,7 +720,7 @@ module HTML5lib return true end - def commentDashState + def commentEndDashState data = @stream.char if data == "-" @state = @states[:commentEnd] @@ -752,19 +780,16 @@ module HTML5lib def beforeDoctypeNameState data = @stream.char if SPACE_CHARACTERS.include? data - elsif ASCII_LOWERCASE.include? data - @currentToken[:name] = data.upcase - @state = @states[:doctypeName] elsif data == ">" - # Character needs to be consumed per the specification so don't - # invoke emitCurrentTokenWithParseError with :data as argument. @tokenQueue.push({:type => :ParseError, :data => _("Unexpected > character. Expected DOCTYPE name.")}) + @currentToken[:correct] = false @tokenQueue.push(@currentToken) @state = @states[:data] elsif data == :EOF @tokenQueue.push({:type => :ParseError, :data => _("Unexpected end of file. Expected DOCTYPE name.")}) + @currentToken[:correct] = false @tokenQueue.push(@currentToken) @state = @states[:data] else @@ -776,33 +801,21 @@ module HTML5lib def doctypeNameState data = @stream.char - needsDoctypeCheck = false if SPACE_CHARACTERS.include? data @state = @states[:afterDoctypeName] - needsDoctypeCheck = true elsif data == ">" @tokenQueue.push(@currentToken) @state = @states[:data] elsif data == :EOF @tokenQueue.push({:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE name.")}) + @currentToken[:correct] = false @tokenQueue.push(@currentToken) @state = @states[:data] else - # We can't just uppercase everything that arrives here. For - # instance, non-ASCII characters. - if ASCII_LOWERCASE.include? data - data = data.upcase - end @currentToken[:name] += data - needsDoctypeCheck = true end - # After some iterations through this state it should eventually say - # "HTML". Otherwise there's an error. - if needsDoctypeCheck and @currentToken[:name] == "HTML" - @currentToken[:data] = false - end return true end @@ -814,16 +827,195 @@ module HTML5lib @state = @states[:data] elsif data == :EOF @currentToken[:data] = true - # XXX EMIT @stream.queue.push(data) @tokenQueue.push({:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}) + @currentToken[:correct] = false + @tokenQueue.push(@currentToken) + @state = @states[:data] + else + charStack = [data] + 5.times { charStack << stream.char } + token = charStack.join('').tr(ASCII_UPPERCASE,ASCII_LOWERCASE) + if token == "public" + @state = @states[:beforeDoctypePublicIdentifier] + elsif token == "system" + @state = @states[:beforeDoctypeSystemIdentifier] + else + @stream.queue += charStack + @tokenQueue.push({:type => :ParseError, :data => + _("Expected 'public' or 'system'. Got '#{charStack.join('')}'")}) + @state = @states[:bogusDoctype] + end + end + return true + end + + def beforeDoctypePublicIdentifierState + data = @stream.char + + if SPACE_CHARACTERS.include?(data) + elsif data == "\"" + @currentToken[:publicId] = "" + @state = @states[:doctypePublicIdentifierDoubleQuoted] + elsif data == "'" + @currentToken[:publicId] = "" + @state = @states[:doctypePublicIdentifierSingleQuoted] + elsif data == ">" + @tokenQueue.push({:type => :ParseError, :data => + _("Unexpected end of DOCTYPE.")}) + @currentToken[:correct] = false + @tokenQueue.push(@currentToken) + @state = @states[:data] + elsif data == :EOF + @tokenQueue.push({:type => :ParseError, :data => + _("Unexpected end of file in DOCTYPE.")}) + @currentToken[:correct] = false @tokenQueue.push(@currentToken) @state = @states[:data] else @tokenQueue.push({:type => :ParseError, :data => - _("Expected space or '>'. Got '" + data + "'")}) - @currentToken[:data] = true + _("Unexpected character in DOCTYPE.")}) + @state = @states[:bogusDoctype] + end + + return true + end + + def doctypePublicIdentifierDoubleQuotedState + data = @stream.char + if data == "\"" + @state = @states[:afterDoctypePublicIdentifier] + elsif data == :EOF + @tokenQueue.push({:type => :ParseError, :data => + _("Unexpected end of file in DOCTYPE.")}) + @currentToken[:correct] = false + @tokenQueue.push(@currentToken) + @state = @states[:data] + else + @currentToken[:publicId] += data + end + return true + end + + def doctypePublicIdentifierSingleQuotedState + data = @stream.char + if data == "'" + @state = @states[:afterDoctypePublicIdentifier] + elsif data == :EOF + @tokenQueue.push({:type => :ParseError, :data => + _("Unexpected end of file in DOCTYPE.")}) + @currentToken[:correct] = false + @tokenQueue.push(@currentToken) + @state = @states[:data] + else + @currentToken[:publicId] += data + end + return true + end + + def afterDoctypePublicIdentifierState + data = @stream.char + if SPACE_CHARACTERS.include?(data) + elsif data == "\"" + @currentToken[:systemId] = "" + @state = @states[:doctypeSystemIdentifierDoubleQuoted] + elsif data == "'" + @currentToken[:systemId] = "" + @state = @states[:doctypeSystemIdentifierSingleQuoted] + elsif data == ">" + @tokenQueue.push(@currentToken) + @state = @states[:data] + elsif data == :EOF + @tokenQueue.push({:type => :ParseError, :data => + _("Unexpected end of file in DOCTYPE.")}) + @currentToken[:correct] = false + @tokenQueue.push(@currentToken) + @state = @states[:data] + else + @tokenQueue.push({:type => :ParseError, :data => + _("Unexpected character in DOCTYPE.")}) + @state = @states[:bogusDoctype] + end + return true + end + + def beforeDoctypeSystemIdentifierState + data = @stream.char + if SPACE_CHARACTERS.include?(data) + elsif data == "\"" + @currentToken[:systemId] = "" + @state = @states[:doctypeSystemIdentifierDoubleQuoted] + elsif data == "'" + @currentToken[:systemId] = "" + @state = @states[:doctypeSystemIdentifierSingleQuoted] + elsif data == ">" + @tokenQueue.push({:type => :ParseError, :data => + _("Unexpected character in DOCTYPE.")}) + @currentToken[:correct] = false + @tokenQueue.push(@currentToken) + @state = @states[:data] + elsif data == :EOF + @tokenQueue.push({:type => :ParseError, :data => + _("Unexpected end of file in DOCTYPE.")}) + @currentToken[:correct] = false + @tokenQueue.push(@currentToken) + @state = @states[:data] + else + @tokenQueue.push({:type => :ParseError, :data => + _("Unexpected character in DOCTYPE.")}) + @state = @states[:bogusDoctype] + end + return true + end + + def doctypeSystemIdentifierDoubleQuotedState + data = @stream.char + if data == "\"" + @state = @states[:afterDoctypeSystemIdentifier] + elsif data == :EOF + @tokenQueue.push({:type => :ParseError, :data => + _("Unexpected end of file in DOCTYPE.")}) + @currentToken[:correct] = false + @tokenQueue.push(@currentToken) + @state = @states[:data] + else + @currentToken[:systemId] += data + end + return true + end + + def doctypeSystemIdentifierSingleQuotedState + data = @stream.char + if data == "'" + @state = @states[:afterDoctypeSystemIdentifier] + elsif data == :EOF + @tokenQueue.push({:type => :ParseError, :data => + _("Unexpected end of file in DOCTYPE.")}) + @currentToken[:correct] = false + @tokenQueue.push(@currentToken) + @state = @states[:data] + else + @currentToken[:systemId] += data + end + return true + end + + def afterDoctypeSystemIdentifierState + data = @stream.char + if SPACE_CHARACTERS.include?(data) + elsif data == ">" + @tokenQueue.push(@currentToken) + @state = @states[:data] + elsif data == :EOF + @tokenQueue.push({:type => :ParseError, :data => + _("Unexpected end of file in DOCTYPE.")}) + @currentToken[:correct] = false + @tokenQueue.push(@currentToken) + @state = @states[:data] + else + @tokenQueue.push({:type => :ParseError, :data => + _("Unexpected character in DOCTYPE.")}) @state = @states[:bogusDoctype] end return true @@ -839,6 +1031,7 @@ module HTML5lib @stream.queue.push(data) @tokenQueue.push({:type => :ParseError, :data => _("Unexpected end of file in bogus doctype.")}) + @currentToken[:correct] = false @tokenQueue.push(@currentToken) @state = @states[:data] end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb index fc120827..20cc58b6 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb @@ -27,6 +27,9 @@ module HTML5lib childNodes << node hpricot.children << node.hpricot end + if (oldparent = node.hpricot.parent) != nil + oldparent.children.delete_at(oldparent.children.index(node.hpricot)) + end node.hpricot.parent = hpricot node.parent = self end diff --git a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb index bf129891..c9d12263 100644 --- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb +++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/hpricot.rb @@ -9,7 +9,7 @@ module HTML5lib def node_details(node) case node when ::Hpricot::Elem - if !node.name + if node.name.empty? [:DOCUMENT_FRAGMENT] else [:ELEMENT, node.name, diff --git a/vendor/plugins/HTML5lib/parse.rb b/vendor/plugins/HTML5lib/parse.rb index c6baaeb3..79233712 100755 --- a/vendor/plugins/HTML5lib/parse.rb +++ b/vendor/plugins/HTML5lib/parse.rb @@ -5,12 +5,20 @@ $:.unshift File.dirname(__FILE__),'lib' def parse(opts, args) + encoding = nil f = args[-1] if f begin - require 'open-uri' if f[0..6] == 'http://' - f = open(f) + if f[0..6] == 'http://' + require 'open-uri' + f = URI.parse(f).open + encoding = f.charset + elsif f == '-' + f = $stdin + else + f = open(f) + end rescue end else @@ -29,22 +37,28 @@ def parse(opts, args) p = HTML5lib::HTMLParser.new(:tree=>treebuilder) end + if opts.parsemethod == :parse + args = [f, encoding] + else + args = [f, 'div', encoding] + end + if opts.profile require 'profiler' Profiler__::start_profile - p.send(opts.parsemethod,f) + p.send(opts.parsemethod, *args) Profiler__::stop_profile Profiler__::print_profile($stderr) elsif opts.time require 'time' t0 = Time.new - document = p.send(opts.parsemethod,f) + document = p.send(opts.parsemethod, *args) t1 = Time.new printOutput(p, document, opts) t2 = Time.new puts "\n\nRun took: %fs (plus %fs to print the output)"%[t1-t0, t2-t1] else - document = p.send(opts.parsemethod,f) + document = p.send(opts.parsemethod, *args) printOutput(p, document, opts) end end @@ -59,7 +73,7 @@ def printOutput(parser, document, opts) require 'html5lib/treewalkers' tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document) require 'html5lib/serializer' - print HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer) + puts HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer) when :hilite print document.hilite when :tree @@ -93,26 +107,35 @@ options.serializer = { require 'optparse' opts = OptionParser.new do |opts| - opts.on("-p", "--[no-]profile", "Profile the run") do |profile| - options.profile = profile - end - - opts.on("-t", "--[no-]time", "Time the run") do |time| - options.time = time - end - + opts.separator "" + opts.separator "Parse Options:" + opts.on("-b", "--treebuilder NAME") do |treebuilder| options.treebuilder = treebuilder end - opts.on("-e", "--error", "Print a list of parse errors") do |error| - options.error = error - end - opts.on("-f", "--fragment", "Parse as a fragment") do |parse| options.parsemethod = :parseFragment end + opts.separator "" + opts.separator "Filter Options:" + + opts.on("--[no-]inject-meta-charset", "inject ") do |inject| + options.serializer[:inject_meta_charset] = inject + end + + opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip| + options.serializer[:strip_whitespace] = strip + end + + opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize| + options.serializer[:sanitize] = sanitize + end + + opts.separator "" + opts.separator "Output Options:" + opts.on("--tree", "output as debug tree") do |tree| options.output = :tree end @@ -130,26 +153,56 @@ opts = OptionParser.new do |opts| options.output = :hilite end - opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding| - options.encoding = encoding + opts.on("-e", "--error", "Print a list of parse errors") do |error| + options.error = error end - opts.on("--[no-]inject-meta-charset", "inject ") do |inject| - options.serializer[:inject_meta_charset] = inject - end - - opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip| - options.serializer[:strip_whitespace] = strip - end - - opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize| - options.serializer[:sanitize] = sanitize - end + opts.separator "" + opts.separator "Serialization Options:" opts.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit| options.serializer[:omit_optional_tags] = omit end + opts.on("--[no-]quote-attr-values", "Quote attribute values") do |quote| + options.serializer[:quote_attr_values] = quote + end + + opts.on("--[no-]use-best-quote-char", "Use best quote character") do |best| + options.serializer[:use_best_quote_char] = best + end + + opts.on("--quote-char C", "Use specified quote character") do |c| + options.serializer[:quote_char] = c + end + + opts.on("--[no-]minimize-boolean-attributes", "Minimize boolean attributes") do |min| + options.serializer[:minimize_boolean_attributes] = min + end + + opts.on("--[no-]use-trailing-solidus", "Use trailing solidus") do |slash| + options.serializer[:use_trailing_solidus] = slash + end + + opts.on("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values") do |lt| + options.serializer[:escape_lt_in_attrs] = lt + end + + opts.separator "" + opts.separator "Other Options:" + + opts.on("-p", "--[no-]profile", "Profile the run") do |profile| + options.profile = profile + end + + opts.on("-t", "--[no-]time", "Time the run") do |time| + options.time = time + end + + opts.on("-c", "--[no-]encoding", "Print character encoding used") do |encoding| + options.encoding = encoding + end + opts.on_tail("-h", "--help", "Show this message") do puts opts exit diff --git a/vendor/plugins/HTML5lib/testdata/encoding/test-yahoo-jp.dat b/vendor/plugins/HTML5lib/testdata/encoding/test-yahoo-jp.dat index daf61250..36292789 100644 --- a/vendor/plugins/HTML5lib/testdata/encoding/test-yahoo-jp.dat +++ b/vendor/plugins/HTML5lib/testdata/encoding/test-yahoo-jp.dat @@ -7,4 +7,4 @@ #errors -7: missing document type declaration +missing document type declaration +unexpected EOF #document | |
| " | #data @@ -1305,12 +1307,12 @@ Line1" |
#data
@@ -1381,12 +1383,11 @@ Line1
Line2
Line3
Line4
6: missing document type declaration
19: unexpected node at end of document
19: unexpected node after body element end tag
-19: meta element start tag out of place
#document
|
|
#data
@@ -1430,14 +1431,13 @@ Line1
Line2
Line3
Line4