HTML5lib Bug

Fixed a bug in the HTML5lib tokenizer (affects S5 slideshows).
Also includes some miscellaneous code cleanup. In particular, don't bother with zapping control characters;
instead, rely on the is_utf8? method to raise an exception (which we do anyway).
This commit is contained in:
Jacques Distler 2007-09-06 10:40:48 -05:00
parent f482036683
commit 5b182bd228
6 changed files with 33 additions and 8 deletions

View file

@ -1,4 +1,5 @@
require 'chunks/chunk'
require 'sanitize'
# The category chunk looks for "category: news" on a line by
# itself and parses the terms after the ':' as categories.
@ -8,6 +9,7 @@ require 'chunks/chunk'
#
# Category lines can be hidden using ':category: news', for example
class Category < Chunk::Abstract
CATEGORY_PATTERN = /^(:)?category\s*:(.*)$/i
def self.pattern() CATEGORY_PATTERN end
@ -16,7 +18,8 @@ class Category < Chunk::Abstract
def initialize(match_data, content)
super(match_data, content)
@hidden = match_data[1]
@list = match_data[2].split(',').map { |c| html_escape(c.strip) }
@list = match_data[2].split(',').map { |c| c.to_s.is_utf8? ? html_escape(c.strip) : nil }
@list.compact!
@unmask_text = ''
if @hidden
@unmask_text = ''

View file

@ -78,6 +78,7 @@ module Chunk
string.gsub( /&/, "&amp;" ).
gsub( /</, "&lt;" ).
gsub( />/, "&gt;" ).
gsub( /'/, "&#39;" ).
gsub( /"/, "&quot;" )
end