Updated to Latest HTML5lib

Synced with latest HTML5lib. Added some RDoc-compatible documentation to the sanitizer.
2007-06-08 17:26:00 -05:00 · 2007-06-08 17:26:00 -05:00 · 3bf560c3b3
commit 3bf560c3b3
parent 8badd0766a
7 changed files with 127 additions and 93 deletions
--- a/lib/sanitize.rb
+++ b/lib/sanitize.rb
@@ -29,6 +29,8 @@ module Sanitize
 #
 # Unless otherwise specified, the string is assumed to be utf-8 encoded.
 # By default, the output is a string. But, optionally, you can return a REXML tree.
 # The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
 # (REXML trees are always utf-8 encoded.)
  def sanitize_xhtml(html, options = {})
    @encoding = 'utf-8'
    @treebuilder = TreeBuilders::REXML::TreeBuilder
@@ -55,6 +57,8 @@ module Sanitize
 #
 # Unless otherwise specified, the string is assumed to be utf-8 encoded.
 # By default, the output is a string. But, optionally, you can return a REXML tree.
 # The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
 # (REXML trees are always utf-8 encoded.)
  def sanitize_html(html, options = {})
    @encoding = 'utf-8'
    @treebuilder = TreeBuilders::REXML::TreeBuilder
--- a/lib/string_utils.rb
+++ b/lib/string_utils.rb
@@ -2,6 +2,12 @@
 class String
 # Check whether a string is valid utf-8
 #
 # :call-seq:
 #    string.is_utf8?    -> boolean
 #
 # returns true if the sequence of bytes in string is valid utf-8
   def is_utf8?
     self =~  /^(
         [\x09\x0A\x0D\x20-\x7E]            # ASCII
@@ -2138,10 +2144,21 @@ class String
 	'zeetrf' => '&#x02128;'
  }
 # Converts XHTML+MathML named entities to Numeric Character References
 #
 #  :call-seq:
 #     string.to_ncr  -> string
 #
    def to_ncr
       self.gsub(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
    end
 # Converts XHTML+MathML named entities to Numeric Character References
 #
 #  :call-seq:
 #     string.to_ncr!  -> str or nil
 #
 # Substitution is done in-place.
    def to_ncr!
       self.gsub!(/&(?:(lt|gt|amp|quot|apos)|[a-zA-Z0-9]+);/){|s| $1 ? s : s.convert_to_ncr}
    end
@@ -2159,6 +2176,14 @@ end
 require 'rexml/element'
 module REXML
  class Element
 # Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References
 #
 #  :call-seq:
 #     elt.to_ncr  -> REXML::Element
 #
 # REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you
 # access the resulting REXML document.
    def to_ncr
      XPath.each(self, '//*') { |el|
        el.texts.each_index  {|i|
--- a/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_body_phase.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/html5parser/in_body_phase.rb
@@ -41,14 +41,14 @@ module HTML5lib
      super(parser, tree)
      # for special handling of whitespace in <pre>
-      @processSpaceCharactersPre = false
+      @processSpaceCharactersDropNewline = false
    end
-    def processSpaceCharactersPre(data)
+    def processSpaceCharactersDropNewline(data)
      #Sometimes (start of <pre> blocks) we want to drop leading newlines
-      @processSpaceCharactersPre = false
+      @processSpaceCharactersDropNewline = false
      if (data.length > 0 and data[0] == ?\n and 
-        @tree.openElements[-1].name == 'pre' and
+        %w[pre textarea].include?(@tree.openElements[-1].name) and
        not @tree.openElements[-1].hasContent)
        data = data[1..-1]
      end
@@ -56,8 +56,8 @@ module HTML5lib
    end
    def processSpaceCharacters(data)
-      if @processSpaceCharactersPre
+      if @processSpaceCharactersDropNewline
-        processSpaceCharactersPre(data)
+        processSpaceCharactersDropNewline(data)
      else
        super(data)
      end
@@ -98,7 +98,7 @@ module HTML5lib
    def startTagCloseP(name, attributes)
      endTagP('p') if in_scope?('p')
      @tree.insertElement(name, attributes)
-      @processSpaceCharactersPre = true if name == 'pre'
+      @processSpaceCharactersDropNewline = true if name == 'pre'
    end
    def startTagForm(name, attributes)
@@ -248,6 +248,7 @@ module HTML5lib
      # XXX Form element pointer checking here as well...
      @tree.insertElement(name, attributes)
      @parser.tokenizer.contentModelFlag = :RCDATA
      @processSpaceCharactersDropNewline = true
    end
    # iframe, noembed noframes, noscript(if scripting enabled)
@@ -312,7 +313,7 @@ module HTML5lib
    def endTagBlock(name)
      #Put us back in the right whitespace handling mode
-      @processSpaceCharactersPre = false if name == 'pre'
+      @processSpaceCharactersDropNewline = false if name == 'pre'
      @tree.generateImpliedEndTags if in_scope?(name)
--- a/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb
@@ -34,7 +34,7 @@ module HTML5lib
      options.each { |name, value| instance_variable_set("@#{name}", value) }
      # List of where new lines occur
-      @new_lines = []
+      @new_lines = [0]
      # Raw Stream
      @raw_stream = open_stream(source)
@@ -55,26 +55,28 @@ module HTML5lib
      # Read bytes from stream decoding them into Unicode
      uString = @raw_stream.read
-      unless @char_encoding == 'utf-8'
+      if @char_encoding == 'windows-1252'
        @win1252 = true
      elsif @char_encoding != 'utf-8'
        begin
          require 'iconv'
-          uString = Iconv.iconv('utf-8', @char_encoding, uString)[0]
+          begin
            uString = Iconv.iconv('utf-8', @char_encoding, uString).first
          rescue
            @win1252 = true
          end
        rescue LoadError
-        rescue Exception
+          @win1252 = true
        end
      end
      # Normalize newlines and null characters
      uString.gsub!(/\r\n?/, "\n")
      uString.gsub!("\x00", [0xFFFD].pack('U'))
      # Convert the unicode string into a list to be used as the data stream
      @data_stream = uString
      @queue = []
      # Reset position in the list to read from
-      reset
+      @tell = 0
    end
    # Produces a file object from source.
@@ -136,10 +138,10 @@ module HTML5lib
    def detect_bom
      bom_dict = {
        "\xef\xbb\xbf" => 'utf-8',
-        "\xff\xfe" => 'utf16le',
+        "\xff\xfe" => 'utf-16le',
-        "\xfe\xff" => 'utf16be',
+        "\xfe\xff" => 'utf-16be',
-        "\xff\xfe\x00\x00" => 'utf32le',
+        "\xff\xfe\x00\x00" => 'utf-32le',
-        "\x00\x00\xfe\xff" => 'utf32be'
+        "\x00\x00\xfe\xff" => 'utf-32be'
      }
      # Go to beginning of file and read in 4 bytes
@@ -175,68 +177,72 @@ module HTML5lib
      return parser.get_encoding
    end
    def determine_new_lines
      # Looks through the stream to find where new lines occur so
      # the position method can tell where it is.
      @new_lines.push(0)
      (0...@data_stream.length).each { |i| @new_lines.push(i) if @data_stream[i] == ?\n }
    end
    # Returns (line, col) of the current position in the stream.
    def position
      # Generate list of new lines first time around
      determine_new_lines if @new_lines.empty?
      line = 0
      tell = @tell
      @new_lines.each do |pos|
-        break unless pos < tell
+        break unless pos < @tell
        line += 1
      end
-      col = tell - @new_lines[line-1] - 1
+      col = @tell - @new_lines[line-1] - 1
      return [line, col]
    end
    # Resets the position in the stream back to the start.
    def reset
      @tell = 0
    end
    # Read one character from the stream or queue if available. Return
    # EOF when EOF is reached.
    def char
      unless @queue.empty?
        return @queue.shift
      else
        c = @data_stream[@tell]
        @tell += 1
-        c = @data_stream[@tell - 1]
+
        case c
-        when 0xC2 .. 0xDF
+        when 0x01 .. 0x7F
-          if @data_stream[@tell .. @tell] =~ /[\x80-\xBF]/
+          if c == 0x0D
-            @tell += 1
+            # normalize newlines
-            @data_stream[@tell-2..@tell-1]
+            @tell += 1 if @data_stream[@tell] == 0x0A
-          else
+            c = 0x0A
            [0xFFFD].pack('U')
          end
-        when 0xE0 .. 0xEF
+
-          if @data_stream[@tell .. @tell+1] =~ /[\x80-\xBF]{2}/
+          # record where newlines occur so that the position method
-            @tell += 2
+          # can tell where it is
-            @data_stream[@tell-3..@tell-1]
+          @new_lines << @tell-1 if c == 0x0A
          c.chr
        when 0x80 .. 0xBF
          if !@win1252
            [0xFFFD].pack('U') # invalid utf-8
          elsif c <= 0x9f
            [ENTITIES_WINDOWS1252[c-0x80]].pack('U')
          else
-            [0xFFFD].pack('U')
+            "\xC2" + c.chr # convert to utf-8
          end
-        when 0xF0 .. 0xF3
+
-          if @data_stream[@tell .. @tell+2] =~ /[\x80-\xBF]{3}/
+        when 0xC0 .. 0xFF
-            @tell += 3
+          if @win1252
-            @data_stream[@tell-4..@tell-1]
+            "\xC3" + (c-64).chr # convert to utf-8
          elsif @data_stream[@tell-1 .. -1] =~ /^
                ( [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
                |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
                | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
                |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
                |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
                | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
                |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
                )/x
            @tell += $1.length - 1
            $1
          else
-            [0xFFFD].pack('U')
+            [0xFFFD].pack('U') # invalid utf-8
          end
        when 0x00
          [0xFFFD].pack('U') # null characters are invalid
        else
-          begin
+          :EOF
            c.chr
          rescue
            :EOF
          end
        end
      end
    end
@@ -247,28 +253,19 @@ module HTML5lib
    def chars_until(characters, opposite=false)
      char_stack = [char]
-      unless char_stack[0] == :EOF
+      while char_stack.last != :EOF
-        while (characters.include? char_stack[-1]) == opposite
+        break unless (characters.include?(char_stack.last)) == opposite
-          unless @queue.empty?
+        char_stack.push(char)
            # First from the queue
            char_stack.push(@queue.shift)
            break if char_stack[-1] == :EOF
          else
            # Then the rest
            begin
              @tell += 1
              char_stack.push(@data_stream[@tell-1].chr)
            rescue
              char_stack.push(:EOF)
              break
            end
          end
        end
      end
      # Put the character stopped on back to the front of the queue
      # from where it came.
-      @queue.insert(0, char_stack.pop)
+      c = char_stack.pop
      if c == :EOF or @data_stream[@tell-1] == c[0]
        @tell -= 1
      else
        @queue.insert(0, c)
      end
      return char_stack.join('')
    end
  end
--- a/vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb
@@ -68,7 +68,6 @@ module HTML5lib
    # to return we yield the token which pauses processing until the next token
    # is requested.
    def each
      @stream.reset
      @tokenQueue = []
      # Start processing. When EOF is reached @state will return false
      # instead of true and the loop will terminate.
--- a/vendor/plugins/HTML5lib/tests/test_sanitizer.rb
+++ b/vendor/plugins/HTML5lib/tests/test_sanitizer.rb
@@ -12,11 +12,11 @@ class SanitizeTest < Test::Unit::TestCase
  include HTML5lib
  def sanitize_xhtml stream
-    XHTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
+    XHTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).join('').gsub(/'/,'"')
  end
  def sanitize_html stream
-    HTMLParser.parseFragment(stream, :tokenizer => HTMLSanitizer).join('').gsub(/'/,'"')
+    HTMLParser.parseFragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8'}).join('').gsub(/'/,'"')
  end
  def sanitize_rexml stream
@@ -259,5 +259,9 @@ class SanitizeTest < Test::Unit::TestCase
      sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
    assert_equal "<p>\360\235\222\265 \360\235\224\270</p>",
      sanitize_rexml("<p>&#x1d4b5; &#x1d538;</p>")
    assert_equal "<p><tspan>\360\235\224\270</tspan> a</p>",
      sanitize_html("<p><tspan>\360\235\224\270</tspan> a</p>")
    assert_equal "<p><tspan>\360\235\224\270</tspan> a</p>",
      sanitize_rexml("<p><tspan>\360\235\224\270</tspan> a</p>")
  end
 end
--- a/vendor/plugins/HTML5lib/tests/test_stream.rb
+++ b/vendor/plugins/HTML5lib/tests/test_stream.rb
@@ -6,7 +6,7 @@ class HTMLInputStreamTest < Test::Unit::TestCase
  include HTML5lib
  def test_char_ascii
-    stream = HTMLInputStream.new("'")
+    stream = HTMLInputStream.new("'", :encoding=>'ascii')
    assert_equal('ascii', stream.char_encoding)
    assert_equal("'", stream.char)
  end
@@ -17,11 +17,21 @@ class HTMLInputStreamTest < Test::Unit::TestCase
  end
  def test_char_utf8
-    stream = HTMLInputStream.new("\xe2\x80\x98")
+    stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end
  def test_char_win1252
    stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
    assert_equal('windows-1252', stream.char_encoding)
    assert_equal("\xc2\xa2", stream.char)
    assert_equal("\xc3\x85", stream.char)
    assert_equal("\xc3\xb1", stream.char)
    assert_equal("\xe2\x80\x99", stream.char)
    assert_equal("\xe2\x80\xa0", stream.char)
  end
  def test_bom
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
    assert_equal('utf-8', stream.char_encoding)
@@ -31,12 +41,6 @@ class HTMLInputStreamTest < Test::Unit::TestCase
  begin
    require 'iconv'
    def test_char_win1252
      stream = HTMLInputStream.new("\x91")
      assert_equal('windows-1252', stream.char_encoding)
      assert_equal("\xe2\x80\x98", stream.char)
    end
    def test_utf_16
      stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
      assert(stream.char_encoding, 'utf-16-le')
@@ -51,10 +55,10 @@ class HTMLInputStreamTest < Test::Unit::TestCase
    assert_equal(0, stream.instance_eval {@tell})
    assert_equal("a\nbb\n", stream.chars_until('c'))
    assert_equal(6, stream.instance_eval {@tell})
-    assert_equal([3,1], stream.position)
+    assert_equal([3,0], stream.position)
    assert_equal("ccc\ndddd", stream.chars_until('x'))
    assert_equal(14, stream.instance_eval {@tell})
-    assert_equal([4,5], stream.position)
+    assert_equal([4,4], stream.position)
-    assert_equal([0,1,4,8], stream.instance_eval {@new_lines})
+    assert_equal([0,1,5,9], stream.instance_eval {@new_lines})
  end
 end