More fixes, sync with HTML5lib

Do a better job with the wrapper <div>s added by xhtmldiff and Maruku's to_html_tree method. More tests fixed.
2007-06-13 23:05:15 -05:00 · 2007-06-13 23:05:15 -05:00 · 3de374d6c1
commit 3de374d6c1
parent 3ca33e52b5
20 changed files with 541 additions and 118 deletions
--- a/lib/chunks/engines.rb
+++ b/lib/chunks/engines.rb
@ -44,7 +44,7 @@ module Engines
      require 'maruku/ext/math'
      html = sanitize_rexml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
            {:math_enabled => false}).to_html_tree)
-      html.gsub(/\A<div>(.*)<\/div>\z/, '\1')
+      html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
    end
  end

@ -56,7 +56,7 @@ module Engines
      require 'maruku/ext/math'
      html = sanitize_rexml(Maruku.new(@content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
            {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}']}).to_html_tree)
-      html.gsub(/\A<div>(.*)<\/div>\z/, '\1')
+      html.gsub(/\A<div class="maruku_wrapper_div">\n?(.*?)\n?<\/div>\Z/m, '\1')
    end
  end

--- a/lib/page_renderer.rb
+++ b/lib/page_renderer.rb
@ -1,4 +1,5 @@
 require 'xhtmldiff'
+
 # Temporary class containing all rendering stuff from a Revision 
 # I want to shift all rendering logic to the controller eventually

@ -43,7 +44,9 @@ class PageRenderer
      previous_content = "<div>" + WikiContent.new(previous_revision, @@url_generator).render!.to_s + "</div>"
      current_content = "<div>" + display_content.to_s  + "</div>"
      diff_doc = REXML::Document.new
-      diff_doc << (div = REXML::Element.new 'div')
+      div = REXML::Element.new('div', nil, {:respect_whitespace =>:all})
+      div.attributes['class'] = 'xhtmldiff_wrapper'
+      diff_doc << div
      hd = XHTMLDiff.new(div)

      parsed_previous_revision = REXML::HashableElementDelegator.new(
@ -54,7 +57,7 @@ class PageRenderer

      diffs = ''
      diff_doc.write(diffs, -1, true, true)
-      diffs.gsub(/^<div>(.*)<\/div>$/, '\1')
+      diffs.gsub(/\A<div class='xhtmldiff_wrapper'>(.*)<\/div>\Z/m, '\1')
    else
      display_content
    end
--- a/lib/wiki_content.rb
+++ b/lib/wiki_content.rb
@ -1,11 +1,11 @@
 require 'cgi'
-require_dependency 'chunks/engines'
-require_dependency 'chunks/category'
+require 'chunks/engines'
+require 'chunks/category'
 require_dependency 'chunks/include'
 require_dependency 'chunks/wiki'
 require_dependency 'chunks/literal'
 require_dependency 'chunks/uri'
-require_dependency 'chunks/nowiki'
+require 'chunks/nowiki'

 # Wiki content is just a string that can process itself with a chain of
 # actions. The actions can modify wiki content so that certain parts of
--- a/test/unit/diff_test.rb
+++ b/test/unit/diff_test.rb
@ -11,7 +11,9 @@ class DiffTest < Test::Unit::TestCase

  def diff(a,b)
    diff_doc = REXML::Document.new
-    diff_doc << (div = REXML::Element.new 'div' )
+    div = REXML::Element.new('div', nil, {:respect_whitespace =>:all})
+    div.attributes['class'] = 'xhtmldiff_wrapper'
+    diff_doc << div
    hd = XHTMLDiff.new(div)
    parsed_a = REXML::HashableElementDelegator.new(
           REXML::XPath.first(REXML::Document.new("<div>"+a+"</div>"), '/div'))
@ -20,14 +22,14 @@ class DiffTest < Test::Unit::TestCase
    Diff::LCS.traverse_balanced(parsed_a, parsed_b, hd)
    diffs = ''
    diff_doc.write(diffs, -1, true, true)
-    diffs
+    diffs.gsub(/\A<div class='xhtmldiff_wrapper'>(.*)<\/div>\Z/m, '\1')
  end

  def test_html_diff_simple
    a = 'this was the original string'
    b = 'this is the new string'
-    assert_equal("<div><span> this<del class='diffmod'> was</del><ins class='diffmod'> is</ins> the" +
-           "<del class='diffmod'> original</del><ins class='diffmod'> new</ins> string</span></div>",
+    assert_equal("<span> this<del class='diffmod'> was</del><ins class='diffmod'> is</ins> the" +
+           "<del class='diffmod'> original</del><ins class='diffmod'> new</ins> string</span>",
          diff(a, b))
  end

@ -35,10 +37,10 @@ class DiffTest < Test::Unit::TestCase
    a = "<p>this was the original string</p>"
    b = "<p>this is</p>\n<p> the new string</p>\n<p>around the world</p>"
    assert_equal(
-        "<div><p><span> this<del class='diffmod'> was</del><ins class='diffmod'> is</ins>" +
+        "<p><span> this<del class='diffmod'> was</del><ins class='diffmod'> is</ins>" +
        "<del class='diffdel'> the</del><del class='diffdel'> original</del><del class='diffdel'> string</del></span></p>" +
        "<ins class='diffins'>\n</ins><ins class='diffins'><p> the new string</p></ins>" +
-        "<ins class='diffins'>\n</ins><ins class='diffins'><p>around the world</p></ins></div>",
+        "<ins class='diffins'>\n</ins><ins class='diffins'><p>around the world</p></ins>",
        diff(a, b))
  end

@ -46,8 +48,8 @@ class DiffTest < Test::Unit::TestCase
    a = "<p>this is a paragraph</p>\n<p>this is a second paragraph</p>\n<p>this is a third paragraph</p>"
    b = "<p>this is a paragraph</p>\n<p>this is a third paragraph</p>"
    assert_equal(
-         "<div><p>this is a paragraph</p>\n<del class='diffdel'><p>this is a second paragraph</p></del>" +
-         "<del class='diffdel'>\n</del><p>this is a third paragraph</p></div>",
+         "<p>this is a paragraph</p>\n<del class='diffdel'><p>this is a second paragraph</p></del>" +
+         "<del class='diffdel'>\n</del><p>this is a third paragraph</p>",
        diff(a, b))
  end

@ -55,8 +57,8 @@ class DiffTest < Test::Unit::TestCase
     a = "<p>foo bar</p>"
     b = "<p>foo</p><p>bar</p>"
     assert_equal(
-       "<div><p><span> foo<del class='diffdel'> bar</del></span></p>" +
-       "<ins class='diffins'><p>bar</p></ins></div>",
+       "<p><span> foo<del class='diffdel'> bar</del></span></p>" +
+       "<ins class='diffins'><p>bar</p></ins>",
      diff(a,b))
  end

@ -64,8 +66,8 @@ class DiffTest < Test::Unit::TestCase
     a = "<p>foo</p><p>bar</p>"
     b = "<p>foo bar</p>"
     assert_equal(
-       "<div><p><span> foo<ins class='diffins'> bar</ins></span></p>" +
-       "<del class='diffdel'><p>bar</p></del></div>",
+       "<p><span> foo<ins class='diffins'> bar</ins></span></p>" +
+       "<del class='diffdel'><p>bar</p></del>",
      diff(a,b))
  end

@ -73,31 +75,31 @@ class DiffTest < Test::Unit::TestCase
     a = "<p>foo bar</p>"
     b = "<p>foo <b>bar</b></p>"
     assert_equal(
-        "<div><p><span> foo<del class='diffdel'> bar</del></span>" +
-        "<ins class='diffins'><b>bar</b></ins></p></div>",
+        "<p><span> foo<del class='diffdel'> bar</del></span>" +
+        "<ins class='diffins'><b>bar</b></ins></p>",
       diff(a,b))
  end

+  def test_html_diff_with_tags
+    a = ""
+    b = "<div>foo</div>"
+    assert_equal "<ins class='diffins'><div>foo</div></ins>", diff(a, b)
+  end
+  
  # FIXME this test fails (ticket #67, http://dev.instiki.org/ticket/67)
  def test_html_diff_preserves_endlines_in_pre
    a = "<pre>a\nb\nc\n</pre>"
    b = "<pre>a\n</pre>"
    assert_equal(
-        "<div><pre><span> a\n<del class='diffdel'>b\nc\n</del></span></pre></div>",
+        "<pre><span> a\n<del class='diffdel'>b\nc\n</del></span></pre>",
        diff(a, b))
  end
  
-  def test_html_diff_with_tags
-    a = ""
-    b = "<div>foo</div>"
-    assert_equal "<div><ins class='diffins'><div>foo</div></ins></div>", diff(a, b)
-  end
-  
+  # FIXME. xhtmldiff fails to detect any change here
  def test_diff_for_tag_change
    a = "<a>x</a>"
    b = "<b>x</b>"
-    # FIXME. xhtmldiff fails to detect any change here
-    assert_equal "<div><del class='diffdel'><a>x</a></del><ins class='diffins'><b>x</b></ins></div>", diff(a, b)
+    assert_equal "<del class='diffdel'><a>x</a></del><ins class='diffins'><b>x</b></ins>", diff(a, b)
  end

 end
--- a/test/unit/maruku_tex.rb
+++ b/test/unit/maruku_tex.rb
@ -0,0 +1,68 @@
+#!/usr/bin/env ruby
+
+require File.dirname(__FILE__) + '/../test_helper'
+
+class RedClothForTexTest < Test::Unit::TestCase # NOTE(review): name says RedCloth, yet test_basics/test_blocks call Maruku#to_latex while the later tests call RedClothForTex#to_tex -- looks adapted from a RedCloth test; confirm which engine is intended
+  def test_basics
+    assert_equal '{\bf First Page}', Maruku.new('*First Page*').to_latex
+    assert_equal '{\em First Page}', Maruku.new('_First Page_').to_latex
+    assert_equal "\\begin{itemize}\n\t\\item A\n\t\t\\item B\n\t\t\\item C\n\t\\end{itemize}", Maruku.new('* A\n* B\n* C').to_latex # NOTE(review): the input is single-quoted, so \n is a literal backslash + n rather than a newline -- confirm this is intended
+  end
+  
+  def test_blocks
+    assert_equal '\section*{hello}', Maruku.new('#hello#').to_latex 
+    assert_equal '\subsection*{hello}', Maruku.new('##hello##').to_latex
+  end
+  
+  def test_table_of_contents
+
+source = <<EOL
+* [[A]]
+** [[B]]
+** [[C]]
+* D
+** [[E]]
+*** F
+EOL
+
+expected_result = <<EOL
+\\pagebreak
+
+\\section{A}
+Abe
+
+\\subsection{B}
+Babe
+
+\\subsection{C}
+\\pagebreak
+
+\\section{D}
+
+\\subsection{E}
+
+\\subsubsection{F}
+EOL
+    expected_result.chop!
+    assert_equal(expected_result, table_of_contents(source, 'A' => 'Abe', 'B' => 'Babe')) # NOTE(review): table_of_contents is not defined in this class; presumably supplied by test_helper -- verify
+  end
+  
+  def test_entities
+    assert_equal "Beck \\& Fowler are 100\\% cool", RedClothForTex.new("Beck & Fowler are 100% cool").to_tex
+  end
+
+  def test_bracket_links
+    assert_equal "such a Horrible Day, but I won't be Made Useless", RedClothForTex.new("such a [[Horrible Day]], but I won't be [[Made Useless]]").to_tex
+  end
+  
+  def test_footnotes_on_abbreviations
+    assert_equal(
+      "such a Horrible Day\\footnote{1}, but I won't be Made Useless", 
+      RedClothForTex.new("such a [[Horrible Day]][1], but I won't be [[Made Useless]]").to_tex
+    )
+  end
+  
+  def test_subsection_depth
+    assert_equal "\\subsubsection*{Hello}", RedClothForTex.new("h4. Hello").to_tex
+  end
+end
--- a/test/unit/page_renderer_test.rb
+++ b/test/unit/page_renderer_test.rb
@ -57,12 +57,12 @@ class PageRendererTest < Test::Unit::TestCase
    set_web_property :markup, :markdown
  
    assert_markup_parsed_as(
-        %{<h1>My Headline</h1>\n\n<p>that <span class="newWikiWord">} +
+        %{<h1 id="my_headline">My Headline</h1>\n\n<p>that <span class="newWikiWord">} +
        %{Smart Engine GUI<a href="../show/SmartEngineGUI">?</a></span></p>}, 
        "My Headline\n===========\n\nthat SmartEngineGUI")
  
    assert_markup_parsed_as(
-        %{<h1>My Headline</h1>\n\n<p>that <span class="newWikiWord">} +
+        %{<h1 id="my_headline">My Headline</h1>\n\n<p>that <span class="newWikiWord">} +
        %{Smart Engine GUI<a href="../show/SmartEngineGUI">?</a></span></p>}, 
        "#My Headline#\n\nthat SmartEngineGUI")
  
@ -77,7 +77,7 @@ class PageRendererTest < Test::Unit::TestCase
  
    assert_markup_parsed_as(
        %{<p>This is a code block:</p>\n\n<pre><code>def a_method(arg)\n} +
-        %{return ThatWay\n</code></pre>\n\n<p>Nice!</p>}, 
+        %{return ThatWay</code></pre>\n\n<p>Nice!</p>}, 
        code_block)
  end
  
@ -105,10 +105,10 @@ class PageRendererTest < Test::Unit::TestCase
    
    set_web_property :markup, :markdown
    assert_markup_parsed_as(
-      "<h1>Markdown heading</h1>\n\n" +
+      "<h1 id=\"markdown_heading\">Markdown heading</h1>\n\n" +
      "<p>h2. Textile heading</p>\n\n" +
      "<p><em>some</em> <strong>text</strong> <em>with</em> -styles-</p>\n\n" +
-      "<ul>\n<li>list 1</li>\n<li>list 2</li>\n</ul>",
+      "<ul>\n<li>list 1</li>\n\n<li>list 2</li>\n</ul>",
      textile_and_markdown)
    
    set_web_property :markup, :textile
--- a/vendor/plugins/HTML5lib/lib/html5lib/constants.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/constants.rb
@ -148,6 +148,18 @@ module HTML5lib
      input
  ]

+  CDATA_ELEMENTS = %w[title textarea]
+
+  RCDATA_ELEMENTS = %w[
+    style
+    script
+    xmp
+    iframe
+    noembed
+    noframes
+    noscript
+  ]
+
  BOOLEAN_ATTRIBUTES = {
    :global => %w[irrelevant],
    'style' => %w[scoped],
--- a/vendor/plugins/HTML5lib/lib/html5lib/filters.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/filters.rb
@ -0,0 +1 @@
+require 'html5lib/filters/optionaltags'
--- a/vendor/plugins/HTML5lib/lib/html5lib/filters/base.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/filters/base.rb
@ -0,0 +1,10 @@
+require 'delegate'
+require 'enumerator'
+
+module HTML5lib
+  module Filters
+    class Base < SimpleDelegator # common superclass of the token-stream filters; wraps its source via SimpleDelegator
+      include Enumerable # #each is not defined here: calls fall through to the wrapped object unless a subclass overrides it
+    end
+  end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/filters/inject_meta_charset.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/filters/inject_meta_charset.rb
@ -0,0 +1,62 @@
+require 'html5lib/filters/base'
+
+module HTML5lib
+  module Filters
+    class InjectMetaCharset < Base
+      def initialize(source, encoding)
+        super(source)
+        @encoding = encoding
+      end
+
+      def each
+        state = :pre_head
+        meta_found = @encoding.nil?
+        pending = []
+
+        __getobj__.each do |token|
+          case token[:type]
+          when :StartTag
+            state = :in_head if token[:name].downcase == "head"
+
+          when :EmptyTag
+            if token[:name].downcase == "meta"
+              if token[:data].any? {|name,value| name=='charset'}
+                # replace charset with actual encoding
+                attrs=Hash[*token[:data].flatten]
+                attrs['charset'] = @encoding
+                token[:data] = attrs.to_a.sort
+                meta_found = true
+              end
+
+            elsif token[:name].downcase == "head" and not meta_found
+              # insert meta into empty head
+              yield({:type => :StartTag, :name => "head", :data => {}})
+              yield({:type => :EmptyTag, :name => "meta",
+                     :data => {"charset" => @encoding}})
+              yield({:type => :EndTag, :name => "head"})
+              meta_found = true
+              next
+            end
+
+          when :EndTag
+            if token[:name].downcase == "head" and pending.any?
+              # insert meta into head (if necessary) and flush pending queue
+              yield pending.shift
+              yield({:type => :EmptyTag, :name => "meta",
+                     :data => {"charset" => @encoding}}) if not meta_found
+              yield pending.shift while pending.any?
+              meta_found = true
+              state = :post_head
+            end
+          end
+
+          if state == :in_head
+            pending << token
+          else
+            yield token
+          end
+        end
+      end
+    end
+  end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/filters/optionaltags.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/filters/optionaltags.rb
@ -0,0 +1,199 @@
+require 'html5lib/constants'
+require 'html5lib/filters/base'
+
+module HTML5lib
+  module Filters
+
+    class OptionalTagFilter < Base
+      def slider
+        previous1 = previous2 = nil
+        __getobj__.each do |token|
+          yield previous2, previous1, token if previous1 != nil
+          previous2 = previous1
+          previous1 = token
+        end
+        yield previous2, previous1, nil
+      end
+
+      def each
+        slider do |previous, token, nexttok|
+          type = token[:type]
+          if type == :StartTag
+            yield token unless token[:data].empty? and is_optional_start(token[:name], previous, nexttok)
+          elsif type == :EndTag
+            yield token unless is_optional_end(token[:name], nexttok)
+          else
+            yield token
+          end
+        end
+      end
+
+      def is_optional_start(tagname, previous, nexttok)
+        type = nexttok ? nexttok[:type] : nil
+        if tagname == 'html'
+          # An html element's start tag may be omitted if the first thing
+          # inside the html element is not a space character or a comment.
+          return ![:Comment, :SpaceCharacters].include?(type)
+        elsif tagname == 'head'
+          # A head element's start tag may be omitted if the first thing
+          # inside the head element is an element.
+          return type == :StartTag
+        elsif tagname == 'body'
+          # A body element's start tag may be omitted if the first thing
+          # inside the body element is not a space character or a comment,
+          # except if the first thing inside the body element is a script
+          # or style element and the node immediately preceding the body
+          # element is a head element whose end tag has been omitted.
+          if [:Comment, :SpaceCharacters].include?(type)
+            return false
+          elsif type == :StartTag
+            # XXX: we do not look at the preceding event, so we never omit
+            # the body element's start tag if it's followed by a script or
+            # a style element.
+            return !%w[script style].include?(nexttok[:name])
+          else
+            return true
+          end
+        elsif tagname == 'colgroup'
+          # A colgroup element's start tag may be omitted if the first thing
+          # inside the colgroup element is a col element, and if the element
+          # is not immediately preceeded by another colgroup element whose
+          # end tag has been omitted.
+          if type == :StartTag
+            # XXX: we do not look at the preceding event, so instead we never
+            # omit the colgroup element's end tag when it is immediately
+            # followed by another colgroup element. See is_optional_end.
+            return nexttok[:name] == "col"
+          else
+            return false
+          end
+        elsif tagname == 'tbody'
+          # A tbody element's start tag may be omitted if the first thing
+          # inside the tbody element is a tr element, and if the element is
+          # not immediately preceeded by a tbody, thead, or tfoot element
+          # whose end tag has been omitted.
+          if type == :StartTag
+            # omit the thead and tfoot elements' end tag when they are
+            # immediately followed by a tbody element. See is_optional_end.
+            if previous and previous[:type] == :EndTag and \
+              %w(tbody thead tfoot).include?(previous[:name])
+              return false
+            end
+
+            return nexttok[:name] == 'tr'
+          else
+            return false
+          end
+        end
+      return false
+      end
+
+      def is_optional_end(tagname, nexttok)
+        type = nexttok ? nexttok[:type] : nil
+        if %w[html head body].include?(tagname)
+          # An html element's end tag may be omitted if the html element
+          # is not immediately followed by a space character or a comment.
+          return ![:Comment, :SpaceCharacters].include?(type)
+        elsif %w[li optgroup option tr].include?(tagname)
+          # A li element's end tag may be omitted if the li element is
+          # immediately followed by another li element or if there is
+          # no more content in the parent element.
+          # An optgroup element's end tag may be omitted if the optgroup
+          # element is immediately followed by another optgroup element,
+          # or if there is no more content in the parent element.
+          # An option element's end tag may be omitted if the option
+          # element is immediately followed by another option element,
+          # or if there is no more content in the parent element.
+          # A tr element's end tag may be omitted if the tr element is
+          # immediately followed by another tr element, or if there is
+          # no more content in the parent element.
+          if type == :StartTag
+            return nexttok[:name] == tagname
+          else
+            return type == :EndTag || type == nil
+          end
+        elsif %w(dt dd).include?(tagname)
+          # A dt element's end tag may be omitted if the dt element is
+          # immediately followed by another dt element or a dd element.
+          # A dd element's end tag may be omitted if the dd element is
+          # immediately followed by another dd element or a dt element,
+          # or if there is no more content in the parent element.
+          if type == :StartTag
+            return %w(dt dd).include?(nexttok[:name])
+          elsif tagname == 'dd'
+            return type == :EndTag || type == nil
+          else
+            return false
+          end
+        elsif tagname == 'p'
+          # A p element's end tag may be omitted if the p element is
+          # immediately followed by an address, blockquote, dl, fieldset,
+          # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
+          # or ul  element, or if there is no more content in the parent
+          # element.
+          if type == :StartTag
+            return %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
+                      h6 hr menu ol p pre table ul).include?(nexttok[:name])
+          else
+            return type == :EndTag || type == nil
+          end
+        elsif tagname == 'colgroup'
+          # A colgroup element's end tag may be omitted if the colgroup
+          # element is not immediately followed by a space character or
+          # a comment.
+          if [:Comment, :SpaceCharacters].include?(type)
+            return false
+          elsif type == :StartTag
+            # XXX: we also look for an immediately following colgroup
+            # element. See is_optional_start.
+            return nexttok[:name] != 'colgroup'
+          else
+            return true
+          end
+        elsif %w(thead tbody).include? tagname
+          # A thead element's end tag may be omitted if the thead element
+          # is immediately followed by a tbody or tfoot element.
+          # A tbody element's end tag may be omitted if the tbody element
+          # is immediately followed by a tbody or tfoot element, or if
+          # there is no more content in the parent element.
+          # A tfoot element's end tag may be omitted if the tfoot element
+          # is immediately followed by a tbody element, or if there is no
+          # more content in the parent element.
+          # XXX: we never omit the end tag when the following element is
+          # a tbody. See is_optional_start.
+          if type == :StartTag
+            return %w(tbody tfoot).include?(nexttok[:name])
+          elsif tagname == 'tbody'
+            return (type == :EndTag or type == nil)
+          else
+            return false
+          end
+        elsif tagname == 'tfoot'
+          # A tfoot element's end tag may be omitted if the tfoot element
+          # is immediately followed by a tbody element, or if there is no
+          # more content in the parent element.
+          # XXX: we never omit the end tag when the following element is
+          # a tbody. See is_optional_start.
+          if type == :StartTag
+            return nexttok[:name] == 'tbody'
+          else
+            return type == :EndTag || type == nil
+          end
+        elsif %w(td th).include? tagname
+          # A td element's end tag may be omitted if the td element is
+          # immediately followed by a td or th element, or if there is
+          # no more content in the parent element.
+          # A th element's end tag may be omitted if the th element is
+          # immediately followed by a td or th element, or if there is
+          # no more content in the parent element.
+          if type == :StartTag
+            return %w(td th).include?(nexttok[:name])
+          else
+            return type == :EndTag || type == nil
+          end
+        end
+        return false
+      end
+    end
+  end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/filters/sanitizer.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/filters/sanitizer.rb
@ -0,0 +1,15 @@
+require 'html5lib/filters/base'
+require 'html5lib/sanitizer'
+
+module HTML5lib
+  module Filters
+    class HTMLSanitizeFilter < Base
+      include HTMLSanitizeModule
+      def each
+        __getobj__.each do |token|
+          yield(sanitize_token(token))
+        end
+      end
+    end
+  end
+end
--- a/vendor/plugins/HTML5lib/lib/html5lib/filters/whitespace.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/filters/whitespace.rb
@ -0,0 +1,36 @@
+require 'html5lib/constants'
+require 'html5lib/filters/base'
+
+module HTML5lib
+  module Filters
+    class WhitespaceFilter < Base
+
+      SPACE_PRESERVE_ELEMENTS = %w[pre textarea] + RCDATA_ELEMENTS
+      SPACES = /[#{SPACE_CHARACTERS.join('')}]+/m
+
+      def each
+        preserve = 0
+        __getobj__.each do |token|
+          case token[:type]
+          when :StartTag
+            if preserve > 0 or SPACE_PRESERVE_ELEMENTS.include?(token[:name])
+              preserve += 1
+            end
+
+          when :EndTag
+            preserve -= 1 if preserve > 0
+
+          when :SpaceCharacters
+            next if preserve == 0
+
+          when :Characters
+            token[:data] = token[:data].sub(SPACES,' ') if preserve == 0
+          end
+
+          yield token
+        end
+      end
+    end
+  end
+end
+
--- a/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb
@ -1,5 +1,4 @@
 require 'cgi'
-require 'html5lib/filters'

 module HTML5lib

@ -176,15 +175,6 @@ module HTML5lib
    end
  end

-  class HTMLSanitizeFilter < Filters::Base
-    include HTMLSanitizeModule
-    def each
-      __getobj__.each do |token|
-        yield(sanitize_token(token))
-      end
-    end
-  end
-
  class HTMLSanitizer < HTMLTokenizer
    include HTMLSanitizeModule
    def each
--- a/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/serializer.rb
@ -1,5 +1,4 @@
 require 'html5lib/constants'
-require 'html5lib/filters'

 module HTML5lib

@ -7,7 +6,7 @@ module HTML5lib
    CDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript]

    def self.serialize(stream, options = {})
-      new(options).serialize(stream)
+      new(options).serialize(stream, options[:encoding])
    end

    def initialize(options={})
@ -40,20 +39,25 @@ module HTML5lib

    def serialize(treewalker, encoding=nil)
      in_cdata = false
-
-
      @errors = []
+
      if encoding and @inject_meta_charset
-        treewalker = filter_inject_meta_charset(treewalker, encoding)
+        require 'html5lib/filters/inject_meta_charset'
+        treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
      end
+
      if @strip_whitespace
-        treewalker = filter_whitespace(treewalker)
+        require 'html5lib/filters/whitespace'
+        treewalker = Filters::WhitespaceFilter.new(treewalker)
      end
+
      if @sanitize
-        require 'html5lib/sanitizer'
-        treewalker = HTMLSanitizeFilter.new(treewalker)
+        require 'html5lib/filters/sanitizer'
+        treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
      end
+
      if @omit_optional_tags
+        require 'html5lib/filters/optionaltags'
        treewalker = Filters::OptionalTagFilter.new(treewalker)
      end

@ -62,25 +66,14 @@ module HTML5lib
        type = token[:type]
        if type == :Doctype
          doctype = "<!DOCTYPE %s>" % token[:name]
-          if encoding
-            result << doctype.encode(encoding)
-          else
          result << doctype
-          end

        elsif [:Characters, :SpaceCharacters].include? type
          if type == :SpaceCharacters or in_cdata
            if in_cdata and token[:data].include?("</")
              serializeError(_("Unexpected </ in CDATA"))
            end
-            if encoding
-              result << token[:data].encode(encoding, errors || "strict")
-            else
            result << token[:data]
-            end
-          elsif encoding
-            result << token[:data].replace("&", "&amp;").
-              encode(encoding, unicode_encode_errors)
          else
            result << token[:data].
                        gsub("&", "&amp;").
@ -97,7 +90,6 @@ module HTML5lib
          end
          attributes = []
          for k,v in attrs = token[:data].to_a.sort
-            k = k.encode(encoding) if encoding
            attributes << ' '

            attributes << k
@ -111,9 +103,6 @@ module HTML5lib
                quote_attr = (SPACE_CHARACTERS + %w(< > " ')).any? {|c| v.include?(c)}
              end
              v = v.gsub("&", "&amp;")
-              if encoding
-                v = v.encode(encoding, unicode_encode_errors)
-              end
              if quote_attr
                quote_char = @quote_char
                if @use_best_quote_char
@ -141,11 +130,7 @@ module HTML5lib
              attributes << "/"
            end
          end
-          if encoding
-            result << "<%s%s>" % [name.encode(encoding), attributes.join('')]
-          else
          result << "<%s%s>" % [name, attributes.join('')]
-          end

        elsif type == :EndTag
          name = token[:name]
@ -155,32 +140,28 @@ module HTML5lib
            serializeError(_("Unexpected child element of a CDATA element"))
          end
          end_tag = "</#{name}>"
-          end_tag = end_tag.encode(encoding) if encoding
          result << end_tag

        elsif type == :Comment
          data = token[:data]
          serializeError(_("Comment contains --")) if data.index("--")
          comment = "<!--%s-->" % token[:data]
-          if encoding
-            comment = comment.encode(encoding, unicode_encode_errors)
-          end
          result << comment

        else
          serializeError(token[:data])
        end
      end
+
+      if encoding and encoding != 'utf-8'
+        require 'iconv'
+        Iconv.iconv(encoding, 'utf-8', result.join('')).first
+      else
        result.join('')
      end
+    end

-    def render(treewalker, encoding=nil)
-      if encoding
-        return "".join(list(serialize(treewalker, encoding)))
-      else
-        return "".join(list(serialize(treewalker)))
-      end
-    end
+    alias :render :serialize

    def serializeError(data="XXX ERROR MESSAGE NEEDED")
      # XXX The idea is to make data mandatory.
@ -189,22 +170,6 @@ module HTML5lib
        raise SerializeError
      end
    end
-
-    def filter_inject_meta_charset(treewalker, encoding)
-      done = false
-      for token in treewalker
-        if not done and token[:type] == :StartTag \
-            and token[:name].lower() == "head"
-          yield({:type => :EmptyTag, :name => "meta", \
-                 :data => {"charset" => encoding}})
-        end
-        yield token
-      end
-    end
-
-    def filter_whitespace(treewalker)
-      raise NotImplementedError
-    end
  end

  # Error in serialized tree
--- a/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/treewalkers/base.rb
@ -27,13 +27,13 @@ module TokenConstructor
    end

    def text(data)
-        if data =~ /^([#{SPACE_CHARACTERS.join('')}]+)/
+        if data =~ /\A([#{SPACE_CHARACTERS.join('')}]+)/m
          yield({:type => :SpaceCharacters, :data => $1})
          data = data[$1.length .. -1]
          return if data.empty?
        end

-        if data =~ /([#{SPACE_CHARACTERS.join('')}]+)$/
+        if data =~ /([#{SPACE_CHARACTERS.join('')}]+)\Z/m
          yield({:type => :Characters, :data => data[0 ... -$1.length]})
          yield({:type => :SpaceCharacters, :data => $1})
        else
--- a/vendor/plugins/HTML5lib/parse.rb
+++ b/vendor/plugins/HTML5lib/parse.rb
@ -59,7 +59,7 @@ def printOutput(parser, document, opts)
    require 'html5lib/treewalkers'
    tokens = HTML5lib::TreeWalkers[opts.treebuilder].new(document)
    require 'html5lib/serializer'
-    print HTML5lib::HTMLSerializer.serialize(tokens, :encoding=>'utf-8')
+    print HTML5lib::HTMLSerializer.serialize(tokens, opts.serializer)
  when :hilite
    print document.hilite
  when :tree
@ -80,11 +80,16 @@ require 'ostruct'
 options = OpenStruct.new
 options.profile = false
 options.time = false
-options.output = :tree
+options.output = :html
 options.treebuilder = 'simpletree'
 options.error = false
 options.encoding = false
 options.parsemethod = :parse
+options.serializer = {
+  :encoding => 'utf-8',
+  :omit_optional_tags => false,
+  :inject_meta_charset => false
+}

 require 'optparse'
 opts = OptionParser.new do |opts|
@ -96,14 +101,6 @@ opts = OptionParser.new do |opts|
    options.time = time
  end
    
-  opts.on("--[no-]tree", "Do not print output tree") do |tree|
-    if tree
-      options.output = :tree
-    else
-      options.output = nil
-    end
-  end
-  
  opts.on("-b", "--treebuilder NAME") do |treebuilder|
    options.treebuilder = treebuilder
  end
@ -116,13 +113,17 @@ opts = OptionParser.new do |opts|
    options.parsemethod = :parseFragment
  end

+  opts.on("--tree", "output as debug tree") do |tree|
+    options.output = :tree
+  end
+  
  opts.on("-x", "--xml", "output as xml") do |xml|
    options.output = :xml
    options.treebuilder = "rexml"
  end
  
-  opts.on("--html", "Output as html") do |html|
-    options.output = :html
+  opts.on("--[no-]html", "Output as html") do |html|
+    options.output = (html ? :html : nil)
  end
  
  opts.on("--hilite", "Output as formatted highlighted code.") do |hilite|
@ -133,6 +134,22 @@ opts = OptionParser.new do |opts|
    options.encoding = encoding
  end

+  opts.on("--[no-]inject-meta-charset", "inject <meta charset>") do |inject|
+    options.serializer[:inject_meta_charset] = inject
+  end
+
+  opts.on("--[no-]strip-whitespace", "strip unnecessary whitespace") do |strip|
+    options.serializer[:strip_whitespace] = strip
+  end
+
+  opts.on("--[no-]sanitize", "escape unsafe tags") do |sanitize|
+    options.serializer[:sanitize] = sanitize
+  end
+
+  opts.on("--[no-]omit-optional-tags", "Omit optional tags") do |omit|
+    options.serializer[:omit_optional_tags] = omit
+  end
+
  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
--- a/vendor/plugins/HTML5lib/testdata/serializer/injectmeta.test
+++ b/vendor/plugins/HTML5lib/testdata/serializer/injectmeta.test
@ -0,0 +1,39 @@
+{"tests": [
+
+{"description": "no encoding",
+ "options": {"inject_meta_charset": true},
+ "input": [["EmptyTag", "head", {}]],
+ "expected": ["<head>"]
+},
+
+{"description": "emptytag head",
+ "options": {"inject_meta_charset": true, "encoding":"utf-8"},
+ "input": [["EmptyTag", "head", {}]],
+ "expected": ["<head><meta charset=utf-8>"]
+},
+
+{"description": "head w/title",
+ "options": {"inject_meta_charset": true, "encoding":"utf-8"},
+ "input": [["StartTag", "head", {}], ["StartTag","title",{}], ["Characters", "foo"],["EndTag", "title"], ["EndTag", "head"]],
+ "expected": ["<head><meta charset=utf-8><title>foo</title>"]
+},
+
+{"description": "head w/meta-charset",
+ "options": {"inject_meta_charset": true, "encoding":"utf-8"},
+ "input": [["StartTag", "head", {}], ["EmptyTag","meta",{"charset":"ascii"}], ["EndTag", "head"]],
+ "expected": ["<head><meta charset=utf-8>"]
+},
+
+{"description": "head w/robots",
+ "options": {"inject_meta_charset": true, "encoding":"utf-8"},
+ "input": [["StartTag", "head", {}], ["EmptyTag","meta",{"name":"robots","content":"noindex"}], ["EndTag", "head"]],
+ "expected": ["<head><meta charset=utf-8><meta content=noindex name=robots>"]
+},
+
+{"description": "head w/robots & charset",
+ "options": {"inject_meta_charset": true, "encoding":"utf-8"},
+ "input": [["StartTag", "head", {}], ["EmptyTag","meta",{"name":"robots","content":"noindex"}], ["EmptyTag","meta",{"charset":"ascii"}], ["EndTag", "head"]],
+ "expected": ["<head><meta content=noindex name=robots><meta charset=utf-8>"]
+}
+
+]}
--- a/vendor/plugins/HTML5lib/tests/test_serializer.rb
+++ b/vendor/plugins/HTML5lib/tests/test_serializer.rb
@ -24,7 +24,7 @@ class JsonWalker < HTML5lib::TreeWalkers::Base
      when 'Doctype'
        yield doctype(token[1])
      else
-        raise ValueError("Unknown token type: " + type)
+        raise "Unknown token type: " + token[0]
      end
    end
  end
@ -37,7 +37,10 @@ class Html5SerializeTestcase < Test::Unit::TestCase
    tests['tests'].each_with_index do |test, index|

      define_method "test_#{test_name}_#{index+1}" do
-        next if test_name == 'whitespace' #TODO
+        if test["options"] and test["options"]["encoding"]
+          test["options"][:encoding] = test["options"]["encoding"]
+        end
+
        result = HTML5lib::HTMLSerializer.
          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
        expected = test["expected"]
--- a/vendor/plugins/maruku/lib/maruku/output/to_html.rb
+++ b/vendor/plugins/maruku/lib/maruku/output/to_html.rb
@ -157,6 +157,7 @@ Example:
 	# Render to an HTML fragment (returns a REXML document tree)
 	def to_html_tree
 		div = Element.new 'div'
+			div.attributes['class'] = 'maruku_wrapper_div'
                        children_to_html.each do |e|
                                div << e
                        end