Updated to Latest HTML5lib

Synced with latest HTML5lib. Added some RDoc-compatible documentation to the sanitizer.
2007-06-08 17:26:00 -05:00 · 2007-06-08 17:26:00 -05:00 · 3bf560c3b3
commit 3bf560c3b3
parent 8badd0766a
7 changed files with 127 additions and 93 deletions
--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@ -29,6 +29,8 @@ module Sanitize
 #
 # Unless otherwise specified, the string is assumed to be utf-8 encoded.
 # By default, the output is a string. But, optionally, you can return a REXML tree.
+# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
+# (REXML trees are always utf-8 encoded.)
  def sanitize_xhtml(html, options = {})
    @encoding = 'utf-8'
    @treebuilder = TreeBuilders::REXML::TreeBuilder
@ -55,6 +57,8 @@ module Sanitize
 #
 # Unless otherwise specified, the string is assumed to be utf-8 encoded.
 # By default, the output is a string. But, optionally, you can return a REXML tree.
+# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
+# (REXML trees are always utf-8 encoded.)
  def sanitize_html(html, options = {})
    @encoding = 'utf-8'
    @treebuilder = TreeBuilders::REXML::TreeBuilder
--- a/lib/string_utils.rb
+++ b/lib/string_utils.rb
@ -2,6 +2,12 @@

 class String

+# Check whether a string is valid utf-8
+#
+# :call-seq:
+#    string.is_utf8?    -> boolean
+#
+# returns true if the sequence of bytes in string is valid utf-8
   def is_utf8?
     self =~  /^(
         [\x09\x0A\x0D\x20-\x7E]            # ASCII
@ -2138,10 +2144,21 @@ class String
 	'zeetrf' => '&#x02128;'
  }

+# Converts XHTML+MathML named entities to Numeric Character References; the predefined XML entities (lt, gt, amp, quot, apos) are left untouched.
+#
+#  :call-seq:
+#     string.to_ncr  -> string
+#
    def to_ncr
       self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
    end

+# Converts XHTML+MathML named entities to Numeric Character References; the predefined XML entities (lt, gt, amp, quot, apos) are left untouched.
+#
+#  :call-seq:
+#     string.to_ncr!  -> str or nil
+#
+# Substitution is done in-place; returns nil if no substitutions were performed.
    def to_ncr!
       self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
    end
@ -2159,6 +2176,14 @@ end
 require 'rexml/element'
 module REXML
  class Element
+
+# Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
+#
+#  :call-seq:
+#     elt.to_ncr  -> REXML::Element
+#
+# REXML typically converts NCRs to utf-8 characters, so that is what you will see when you
+# access the resulting REXML document.
    def to_ncr
      XPath.each(self, '//*') { |el|
        el.texts.each_index  {|i|
--- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_body_phase.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_body_phase.rb
@ -41,14 +41,14 @@ module HTML5lib
      super(parser, tree)

      # for special handling of whitespace in <pre>
-      @processSpaceCharactersPre = false
+      @processSpaceCharactersDropNewline = false
    end

-    def processSpaceCharactersPre(data)
+    def processSpaceCharactersDropNewline(data)
      #Sometimes (start of <pre> blocks) we want to drop leading newlines
-      @processSpaceCharactersPre = false
+      @processSpaceCharactersDropNewline = false
      if (data.length > 0 and data[0] == ?\n and 
-        @tree.openElements[-1].name == 'pre' and
+        %w[pre textarea].include?(@tree.openElements[-1].name) and
        not @tree.openElements[-1].hasContent)
        data = data[1..-1]
      end
@ -56,8 +56,8 @@ module HTML5lib
    end

    def processSpaceCharacters(data)
-      if @processSpaceCharactersPre
-        processSpaceCharactersPre(data)
+      if @processSpaceCharactersDropNewline
+        processSpaceCharactersDropNewline(data)
      else
        super(data)
      end
@ -98,7 +98,7 @@ module HTML5lib
    def startTagCloseP(name, attributes)
      endTagP('p') if in_scope?('p')
      @tree.insertElement(name, attributes)
-      @processSpaceCharactersPre = true if name == 'pre'
+      @processSpaceCharactersDropNewline = true if name == 'pre'
    end

    def startTagForm(name, attributes)
@ -248,6 +248,7 @@ module HTML5lib
      # XXX Form element pointer checking here as well...
      @tree.insertElement(name, attributes)
      @parser.tokenizer.contentModelFlag = :RCDATA
+      @processSpaceCharactersDropNewline = true
    end

    # iframe, noembed noframes, noscript(if scripting enabled)
@ -312,7 +313,7 @@ module HTML5lib

    def endTagBlock(name)
      #Put us back in the right whitespace handling mode
-      @processSpaceCharactersPre = false if name == 'pre'
+      @processSpaceCharactersDropNewline = false if name == 'pre'

      @tree.generateImpliedEndTags if in_scope?(name)

--- a/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
@ -34,7 +34,7 @@ module HTML5lib
      options.each { |name, value| instance_variable_set("@#{name}", value) }

      # List of where new lines occur
-      @new_lines = []
+      @new_lines = [0]

      # Raw Stream
      @raw_stream = open_stream(source)
@ -55,26 +55,28 @@ module HTML5lib

      # Read bytes from stream decoding them into Unicode
      uString = @raw_stream.read
-      unless @char_encoding == 'utf-8'
+      if @char_encoding == 'windows-1252'
+        @win1252 = true
+      elsif @char_encoding != 'utf-8'
        begin
          require 'iconv'
-          uString = Iconv.iconv('utf-8', @char_encoding, uString)[0]
+          begin
+            uString = Iconv.iconv('utf-8', @char_encoding, uString).first
+          rescue
+            @win1252 = true
+          end
        rescue LoadError
-        rescue Exception
+          @win1252 = true
        end
      end

-      # Normalize newlines and null characters
-      uString.gsub!(/\r\n?/, "\n")
-      uString.gsub!("\x00", [0xFFFD].pack('U'))
-
      # Convert the unicode string into a list to be used as the data stream
      @data_stream = uString

      @queue = []

      # Reset position in the list to read from
-      reset
+      @tell = 0
    end

    # Produces a file object from source.
@ -136,10 +138,10 @@ module HTML5lib
    def detect_bom
      bom_dict = {
        "\xef\xbb\xbf" => 'utf-8',
-        "\xff\xfe" => 'utf16le',
-        "\xfe\xff" => 'utf16be',
-        "\xff\xfe\x00\x00" => 'utf32le',
-        "\x00\x00\xfe\xff" => 'utf32be'
+        "\xff\xfe" => 'utf-16le',
+        "\xfe\xff" => 'utf-16be',
+        "\xff\xfe\x00\x00" => 'utf-32le',
+        "\x00\x00\xfe\xff" => 'utf-32be'
      }

      # Go to beginning of file and read in 4 bytes
@ -175,68 +177,72 @@ module HTML5lib
      return parser.get_encoding
    end

-    def determine_new_lines
-      # Looks through the stream to find where new lines occur so
-      # the position method can tell where it is.
-      @new_lines.push(0)
-      (0...@data_stream.length).each { |i| @new_lines.push(i) if @data_stream[i] == ?\n }
-    end
-
    # Returns (line, col) of the current position in the stream.
    def position
-      # Generate list of new lines first time around
-      determine_new_lines if @new_lines.empty?
      line = 0
-      tell = @tell
      @new_lines.each do |pos|
-        break unless pos < tell
+        break unless pos < @tell
        line += 1
      end
-      col = tell - @new_lines[line-1] - 1
+      col = @tell - @new_lines[line-1] - 1
      return [line, col]
    end

-    # Resets the position in the stream back to the start.
-    def reset
-      @tell = 0
-    end
-
    # Read one character from the stream or queue if available. Return
    # EOF when EOF is reached.
    def char
      unless @queue.empty?
        return @queue.shift
      else
+        c = @data_stream[@tell]
        @tell += 1
-        c = @data_stream[@tell - 1]
+
        case c
-        when 0xC2 .. 0xDF
-          if @data_stream[@tell .. @tell] =~ /[\x80-\xBF]/
-            @tell += 1
-            @data_stream[@tell-2..@tell-1]
-          else
-            [0xFFFD].pack('U')
+        when 0x01 .. 0x7F
+          if c == 0x0D
+            # normalize newlines
+            @tell += 1 if @data_stream[@tell] == 0x0A
+            c = 0x0A
          end
-        when 0xE0 .. 0xEF
-          if @data_stream[@tell .. @tell+1] =~ /[\x80-\xBF]{2}/
-            @tell += 2
-            @data_stream[@tell-3..@tell-1]
+
+          # record where newlines occur so that the position method
+          # can tell where it is
+          @new_lines << @tell-1 if c == 0x0A
+
+          c.chr
+
+        when 0x80 .. 0xBF
+          if !@win1252
+            [0xFFFD].pack('U') # invalid utf-8
+          elsif c <= 0x9f
+            [ENTITIES_WINDOWS1252[c-0x80]].pack('U')
          else
-            [0xFFFD].pack('U')
+            "\xC2" + c.chr # convert to utf-8
          end
-        when 0xF0 .. 0xF3
-          if @data_stream[@tell .. @tell+2] =~ /[\x80-\xBF]{3}/
-            @tell += 3
-            @data_stream[@tell-4..@tell-1]
+
+        when 0xC0 .. 0xFF
+          if @win1252
+            "\xC3" + (c-64).chr # convert to utf-8
+          elsif @data_stream[@tell-1 .. -1] =~ /^
+                ( [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
+                |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
+                | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
+                |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
+                |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
+                | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
+                |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
+                )/x
+            @tell += $1.length - 1
+            $1
          else
-            [0xFFFD].pack('U')
+            [0xFFFD].pack('U') # invalid utf-8
          end
+
+        when 0x00
+          [0xFFFD].pack('U') # null characters are invalid
+
        else
-          begin
-            c.chr
-          rescue
-            :EOF
-          end
+          :EOF
        end
      end
    end
@ -247,28 +253,19 @@ module HTML5lib
    def chars_until(characters, opposite=false)
      char_stack = [char]

-      unless char_stack[0] == :EOF
-        while (characters.include? char_stack[-1]) == opposite
-          unless @queue.empty?
-            # First from the queue
-            char_stack.push(@queue.shift)
-            break if char_stack[-1] == :EOF
-          else
-            # Then the rest
-            begin
-              @tell += 1
-              char_stack.push(@data_stream[@tell-1].chr)
-            rescue
-              char_stack.push(:EOF)
-              break
-            end
-          end
-        end
+      while char_stack.last != :EOF
+        break unless (characters.include?(char_stack.last)) == opposite
+        char_stack.push(char)
      end

      # Put the character stopped on back to the front of the queue
      # from where it came.
-      @queue.insert(0, char_stack.pop)
+      c = char_stack.pop
+      if c == :EOF or @data_stream[@tell-1] == c[0]
+        @tell -= 1
+      else
+        @queue.insert(0, c)
+      end
      return char_stack.join('')
    end
  end
--- a/vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb
@ -68,7 +68,6 @@ module HTML5lib
    # to return we yield the token which pauses processing until the next token
    # is requested.
    def each
-      @stream.reset
      @tokenQueue = []
      # Start processing. When EOF is reached @state will return false
      # instead of true and the loop will terminate.
--- a/vendor/plugins/HTML5lib/tests/test_sanitizer.rb
+++ b/vendor/plugins/HTML5lib/tests/test_sanitizer.rb
@ -12,11 +12,11 @@ class SanitizeTest < Test::Unit::TestCase
  include HTML5lib

  def sanitize_xhtml stream
-    XHTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
+    XHTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).join('').gsub(/'/,'"')
  end

  def sanitize_html stream
-    HTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
+    HTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).join('').gsub(/'/,'"')
  end

  def sanitize_rexml stream
@ -259,5 +259,9 @@ class SanitizeTest < Test::Unit::TestCase
      sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
    assert_equal "<p>\360\235\222\265 \360\235\224\270</p>",
      sanitize_rexml("<p>&#x1d4b5; &#x1d538;</p>")
+    assert_equal "<p><tspan>\360\235\224\270</tspan> a</p>",
+      sanitize_html("<p><tspan>\360\235\224\270</tspan> a</p>")
+    assert_equal "<p><tspan>\360\235\224\270</tspan> a</p>",
+      sanitize_rexml("<p><tspan>\360\235\224\270</tspan> a</p>")
  end
 end
--- a/vendor/plugins/HTML5lib/tests/test_stream.rb
+++ b/vendor/plugins/HTML5lib/tests/test_stream.rb
@ -6,7 +6,7 @@ class HTMLInputStreamTest < Test::Unit::TestCase
  include HTML5lib

  def test_char_ascii
-    stream = HTMLInputStream.new("'")
+    stream = HTMLInputStream.new("'", :encoding=>'ascii')
    assert_equal('ascii', stream.char_encoding)
    assert_equal("'", stream.char)
  end
@ -17,11 +17,21 @@ class HTMLInputStreamTest < Test::Unit::TestCase
  end

  def test_char_utf8
-    stream = HTMLInputStream.new("\xe2\x80\x98")
+    stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

+  def test_char_win1252
+    stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
+    assert_equal('windows-1252', stream.char_encoding)
+    assert_equal("\xc2\xa2", stream.char)
+    assert_equal("\xc3\x85", stream.char)
+    assert_equal("\xc3\xb1", stream.char)
+    assert_equal("\xe2\x80\x99", stream.char)
+    assert_equal("\xe2\x80\xa0", stream.char)
+  end
+
  def test_bom
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
    assert_equal('utf-8', stream.char_encoding)
@ -31,12 +41,6 @@ class HTMLInputStreamTest < Test::Unit::TestCase
  begin
    require 'iconv'

-    def test_char_win1252
-      stream = HTMLInputStream.new("\x91")
-      assert_equal('windows-1252', stream.char_encoding)
-      assert_equal("\xe2\x80\x98", stream.char)
-    end
-
    def test_utf_16
      stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
      assert(stream.char_encoding, 'utf-16-le')
@ -51,10 +55,10 @@ class HTMLInputStreamTest < Test::Unit::TestCase
    assert_equal(0, stream.instance_eval {@tell})
    assert_equal("a\nbb\n", stream.chars_until('c'))
    assert_equal(6, stream.instance_eval {@tell})
-    assert_equal([3,1], stream.position)
+    assert_equal([3,0], stream.position)
    assert_equal("ccc\ndddd", stream.chars_until('x'))
    assert_equal(14, stream.instance_eval {@tell})
-    assert_equal([4,5], stream.position)
-    assert_equal([0,1,4,8], stream.instance_eval {@new_lines})
+    assert_equal([4,4], stream.position)
+    assert_equal([0,1,5,9], stream.instance_eval {@new_lines})
  end
 end