HTML5lib Bug

Fixed a bug in the HTML5lib tokenizer (affects S5 slideshows). Some miscellaneous code cleanup. In particular, don't bother zapping control characters; instead, rely on the is_utf8? check to reject invalid input by raising an exception (which we do anyway).
2007-09-06 10:40:48 -05:00 · 2007-09-06 10:40:48 -05:00 · 5b182bd228
commit 5b182bd228
parent f482036683
6 changed files with 33 additions and 8 deletions
--- a/app/controllers/wiki_controller.rb
+++ b/app/controllers/wiki_controller.rb
@@ -227,13 +227,13 @@ class WikiController < ApplicationController
  def save
    render(:status => 404, :text => 'Undefined page name') and return if @page_name.nil?

-    author_name = params['author'].delete("\x01-\x08\x0B\x0C\x0E-\x1F")
+    author_name = params['author']
    author_name = 'AnonymousCoward' if author_name =~ /^\s*$/
    raise "Your name was not valid utf-8" if !author_name.is_utf8?
    cookies['author'] = { :value => author_name, :expires => Time.utc(2030) }
    
    begin
-      the_content = params['content'].delete("\x01-\x08\x0B\x0C\x0E-\x1F")
+      the_content = params['content']
      raise "Your content was not valid utf-8" if !the_content.is_utf8?
      filter_spam(the_content)
      if @page
@@ -294,16 +294,16 @@ class WikiController < ApplicationController

  def s5
    if @web.markup == :markdownMML
-      my_content = Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
+      my_content = Maruku.new(@page.content.delete("\r"),
           {:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}'], :content_only => true,
            :author => @page.author, :title => @page.plain_name})
-      @s5_content = sanitize_xhtml(my_content.to_s5.to_ncr)
+      @s5_content = sanitize_xhtml(my_content.to_s5)
      @s5_theme = my_content.s5_theme
    elsif @web.markup == :markdown
-      my_content = Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
+      my_content = Maruku.new(@page.content.delete("\r"),
           {:math_enabled => false, :content_only => true,
            :author => @page.author, :title => @page.plain_name})
-      @s5_content = sanitize_xhtml(my_content.to_s5.to_ncr)
+      @s5_content = sanitize_xhtml(my_content.to_s5)
      @s5_theme = my_content.s5_theme
    else
      @s5_content = "S5 not supported with this text filter"
--- a/lib/chunks/category.rb
+++ b/lib/chunks/category.rb
@@ -1,4 +1,5 @@
 require 'chunks/chunk'
+require 'sanitize'

 # The category chunk looks for "category: news" on a line by
 # itself and parses the terms after the ':' as categories.
@@ -8,6 +9,7 @@ require 'chunks/chunk'
 #
 # Category lines can be hidden using ':category: news', for example
 class Category < Chunk::Abstract
+
  CATEGORY_PATTERN = /^(:)?category\s*:(.*)$/i
  def self.pattern() CATEGORY_PATTERN  end

@@ -16,7 +18,8 @@ class Category < Chunk::Abstract
 def initialize(match_data, content)
    super(match_data, content)
    @hidden = match_data[1]
-    @list = match_data[2].split(',').map { |c| html_escape(c.strip) }
+    @list = match_data[2].split(',').map { |c| c.to_s.is_utf8? ? html_escape(c.strip) : nil }
+    @list.compact!
    @unmask_text = ''
    if @hidden
      @unmask_text = ''
--- a/lib/chunks/chunk.rb
+++ b/lib/chunks/chunk.rb
@@ -78,6 +78,7 @@ module Chunk
      string.gsub( /&/, "&amp;" ).
             gsub( /</, "&lt;" ).
             gsub( />/, "&gt;" ).
+             gsub( /'/, "&#39;" ).
             gsub( /"/, "&quot;" )
    end

--- a/vendor/plugins/HTML5lib/lib/html5/tokenizer.rb
+++ b/vendor/plugins/HTML5lib/lib/html5/tokenizer.rb
@@ -217,7 +217,7 @@ module HTML5

    # This method replaces the need for "entityInAttributeValueState".
    def process_entity_in_attribute
-      entity = consume_entity(true)
+      entity = consume_entity()
      if entity
        @current_token[:data][-1][1] += entity
      else
--- a/vendor/plugins/HTML5lib/testdata/sanitizer/tests1.dat
+++ b/vendor/plugins/HTML5lib/testdata/sanitizer/tests1.dat
@@ -405,5 +405,25 @@
    "name": "xul",
    "input": "<p style=\"-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')\">fubar</p>",
    "output": "<p style=''>fubar</p>"
+  },
+
+  {
+    "name": "quotes_in_attributes",
+    "input": "<img src='foo' title='\"foo\" bar' />",
+    "rexml": "<img src='foo' title='\"foo\" bar' />",
+    "output": "<img title='&quot;foo&quot; bar' src='foo'/>"
+  },
+
+  {
+    "name": "named_entities_in_attributes",
+    "input": "<img src='foo' title='&quot;foo&quot; bar' />",
+    "rexml": "<img src='foo' title='\"foo\" bar' />",
+    "output": "<img title='&quot;foo&quot; bar' src='foo'/>"
+  },
+  {
+    "name": "NCRs_in_attributes",
+    "input": "<img src='foo' title='&#x22;foo&#x22; bar' />",
+    "rexml": "<img src='foo' title='\"foo\" bar' />",
+    "output": "<img title='&quot;foo&quot; bar' src='foo'/>"
  }
 ]
--- a/vendor/plugins/maruku/lib/maruku/output/s5/to_s5.rb
+++ b/vendor/plugins/maruku/lib/maruku/output/s5/to_s5.rb
@@ -11,6 +11,7 @@ module MaRuKu
 	  string.gsub( /&/, "&amp;" ).
 	         gsub( /</, "&lt;" ).
 	         gsub( />/, "&gt;" ).
+	         gsub( /'/, "&#39;" ).
 	         gsub( /"/, "&quot;" )
 	end