HTML5lib Bug

Fixed a bug in the HTML5lib tokenizer (affects S5 slideshows).
Also includes some miscellaneous code cleanup. In particular, don't bother zapping control characters;
instead, rely on the is_utf8? check to raise an exception on invalid input (which we do anyway).
This commit is contained in:
Jacques Distler 2007-09-06 10:40:48 -05:00
parent f482036683
commit 5b182bd228
6 changed files with 33 additions and 8 deletions

View file

@ -227,13 +227,13 @@ class WikiController < ApplicationController
def save
render(:status => 404, :text => 'Undefined page name') and return if @page_name.nil?
author_name = params['author'].delete("\x01-\x08\x0B\x0C\x0E-\x1F")
author_name = params['author']
author_name = 'AnonymousCoward' if author_name =~ /^\s*$/
raise "Your name was not valid utf-8" if !author_name.is_utf8?
cookies['author'] = { :value => author_name, :expires => Time.utc(2030) }
begin
the_content = params['content'].delete("\x01-\x08\x0B\x0C\x0E-\x1F")
the_content = params['content']
raise "Your content was not valid utf-8" if !the_content.is_utf8?
filter_spam(the_content)
if @page
@ -294,16 +294,16 @@ class WikiController < ApplicationController
def s5
if @web.markup == :markdownMML
my_content = Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
my_content = Maruku.new(@page.content.delete("\r"),
{:math_enabled => true, :math_numbered => ['\\[','\\begin{equation}'], :content_only => true,
:author => @page.author, :title => @page.plain_name})
@s5_content = sanitize_xhtml(my_content.to_s5.to_ncr)
@s5_content = sanitize_xhtml(my_content.to_s5)
@s5_theme = my_content.s5_theme
elsif @web.markup == :markdown
my_content = Maruku.new(@page.content.delete("\r\x01-\x08\x0B\x0C\x0E-\x1F"),
my_content = Maruku.new(@page.content.delete("\r"),
{:math_enabled => false, :content_only => true,
:author => @page.author, :title => @page.plain_name})
@s5_content = sanitize_xhtml(my_content.to_s5.to_ncr)
@s5_content = sanitize_xhtml(my_content.to_s5)
@s5_theme = my_content.s5_theme
else
@s5_content = "S5 not supported with this text filter"

View file

@ -1,4 +1,5 @@
require 'chunks/chunk'
require 'sanitize'
# The category chunk looks for "category: news" on a line by
# itself and parses the terms after the ':' as categories.
@ -8,6 +9,7 @@ require 'chunks/chunk'
#
# Category lines can be hidden using ':category: news', for example
class Category < Chunk::Abstract
CATEGORY_PATTERN = /^(:)?category\s*:(.*)$/i
def self.pattern() CATEGORY_PATTERN end
@ -16,7 +18,8 @@ class Category < Chunk::Abstract
def initialize(match_data, content)
super(match_data, content)
@hidden = match_data[1]
@list = match_data[2].split(',').map { |c| html_escape(c.strip) }
@list = match_data[2].split(',').map { |c| c.to_s.is_utf8? ? html_escape(c.strip) : nil }
@list.compact!
@unmask_text = ''
if @hidden
@unmask_text = ''

View file

@ -78,6 +78,7 @@ module Chunk
string.gsub( /&/, "&amp;" ).
gsub( /</, "&lt;" ).
gsub( />/, "&gt;" ).
gsub( /'/, "&#39;" ).
gsub( /"/, "&quot;" )
end

View file

@ -217,7 +217,7 @@ module HTML5
# This method replaces the need for "entityInAttributeValueState".
def process_entity_in_attribute
entity = consume_entity(true)
entity = consume_entity()
if entity
@current_token[:data][-1][1] += entity
else

View file

@ -405,5 +405,25 @@
"name": "xul",
"input": "<p style=\"-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')\">fubar</p>",
"output": "<p style=''>fubar</p>"
},
{
"name": "quotes_in_attributes",
"input": "<img src='foo' title='\"foo\" bar' />",
"rexml": "<img src='foo' title='\"foo\" bar' />",
"output": "<img title='&quot;foo&quot; bar' src='foo'/>"
},
{
"name": "named_entities_in_attributes",
"input": "<img src='foo' title='&quot;foo&quot; bar' />",
"rexml": "<img src='foo' title='\"foo\" bar' />",
"output": "<img title='&quot;foo&quot; bar' src='foo'/>"
},
{
"name": "NCRs_in_attributes",
"input": "<img src='foo' title='&#x22;foo&#x22; bar' />",
"rexml": "<img src='foo' title='\"foo\" bar' />",
"output": "<img title='&quot;foo&quot; bar' src='foo'/>"
}
]

View file

@ -11,6 +11,7 @@ module MaRuKu
string.gsub( /&/, "&amp;" ).
gsub( /</, "&lt;" ).
gsub( />/, "&gt;" ).
gsub( /'/, "&#39;" ).
gsub( /"/, "&quot;" )
end