HTML5lib is Back.

Synced with the latest version of HTML5lib, which fixes a problem with astral-plane characters. I should really run some tests, but the HTML5lib sanitizer seems to be 2-5 times slower than the old sanitizer.
2007-05-30 10:45:52 -05:00 · 2007-05-30 10:45:52 -05:00 · 4dd70af5ae
commit 4dd70af5ae
parent e1a6827f1f
39 changed files with 4843 additions and 5576 deletions
--- a/vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb
+++ b/vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb
@@ -16,126 +16,126 @@ require 'html5lib/constants'

 module HTML5lib

-# liberal XML parser
-class XMLParser < HTMLParser
+  # liberal XML parser
+  class XMLParser < HTMLParser

-    def initialize(options={})
-        super options
-        @phases[:initial] = XmlRootPhase.new(self, @tree)
+    def initialize(options = {})
+      super options
+      @phases[:initial] = XmlRootPhase.new(self, @tree)
    end

    def normalizeToken(token)
-        if token[:type] == :StartTag or token[:type] == :EmptyTag
-            # We need to remove the duplicate attributes and convert attributes
-            # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
+      if token[:type] == :StartTag or token[:type] == :EmptyTag
+        # We need to remove the duplicate attributes and convert attributes
+        # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}

-            token[:data] = Hash[*token[:data].reverse.flatten]
+        token[:data] = Hash[*token[:data].reverse.flatten]

-            # For EmptyTags, process both a Start and an End tag
-            if token[:type] == :EmptyTag
-                @phase.processStartTag(token[:name], token[:data])
-                token[:data] = {}
-                token[:type] = :EndTag
-            end
-
-        elsif token[:type] == :EndTag
-            if token[:data]
-               parseError(_("End tag contains unexpected attributes."))
-            end
-
-        elsif token[:type] == :Comment
-            # Rescue CDATA from the comments
-            if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
-                token[:type] = :Characters
-                token[:data] = token[:data][7 ... -2]
-            end
+        # For EmptyTags, process both a Start and an End tag
+        if token[:type] == :EmptyTag
+          @phase.processStartTag(token[:name], token[:data])
+          token[:data] = {}
+          token[:type] = :EndTag
        end

-        return token
+      elsif token[:type] == :EndTag
+        if token[:data]
+           parseError(_("End tag contains unexpected attributes."))
+        end
+
+      elsif token[:type] == :Comment
+        # Rescue CDATA from the comments
+        if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
+          token[:type] = :Characters
+          token[:data] = token[:data][7 ... -2]
+        end
+      end
+
+      return token
    end
-end
+  end

-# liberal XMTHML parser
-class XHTMLParser < XMLParser
+  # liberal XMTHML parser
+  class XHTMLParser < XMLParser

-    def initialize(options={})
-        super options
-        @phases[:initial] = InitialPhase.new(self, @tree)
-        @phases[:rootElement] = XhmlRootPhase.new(self, @tree)
+    def initialize(options = {})
+      super options
+      @phases[:initial] = InitialPhase.new(self, @tree)
+      @phases[:rootElement] = XhmlRootPhase.new(self, @tree)
    end

    def normalizeToken(token)
-        super(token)
+      super(token)

-        # ensure that non-void XHTML elements have content so that separate
-        # open and close tags are emitted
-        if token[:type]  == :EndTag and \
-            not VOID_ELEMENTS.include? token[:name] and \
-            token[:name] == @tree.openElements[-1].name and \
-            not @tree.openElements[-1].hasContent
-            @tree.insertText('') unless
-                @tree.openElements.any? {|e|
-                    e.attributes.keys.include? 'xmlns' and
-                    e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
-                }
-        end
+      # ensure that non-void XHTML elements have content so that separate
+      # open and close tags are emitted
+      if token[:type]  == :EndTag and \
+        not VOID_ELEMENTS.include? token[:name] and \
+        token[:name] == @tree.openElements[-1].name and \
+        not @tree.openElements[-1].hasContent
+        @tree.insertText('') unless
+          @tree.openElements.any? {|e|
+            e.attributes.keys.include? 'xmlns' and
+            e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
+          }
+      end

-        return token
+      return token
    end
-end
+  end

-class XhmlRootPhase < RootElementPhase
+  class XhmlRootPhase < RootElementPhase
    def insertHtmlElement
-        element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
-        @tree.openElements.push(element)
-        @tree.document.appendChild(element)
-        @parser.phase = @parser.phases[:beforeHead]
+      element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
+      @tree.openElements.push(element)
+      @tree.document.appendChild(element)
+      @parser.phase = @parser.phases[:beforeHead]
    end
-end
+  end

-class XmlRootPhase < Phase
+  class XmlRootPhase < Phase
    # Prime the Xml parser
    @start_tag_handlers = Hash.new(:startTagOther)
    @end_tag_handlers = Hash.new(:endTagOther)
    def startTagOther(name, attributes)
-        @tree.openElements.push(@tree.document)
-        element = @tree.createElement(name, attributes)
-        @tree.openElements[-1].appendChild(element)
-        @tree.openElements.push(element)
-        @parser.phase = XmlElementPhase.new(@parser,@tree)
+      @tree.openElements.push(@tree.document)
+      element = @tree.createElement(name, attributes)
+      @tree.openElements[-1].appendChild(element)
+      @tree.openElements.push(element)
+      @parser.phase = XmlElementPhase.new(@parser,@tree)
    end
    def endTagOther(name)
-        super
-        @tree.openElements.pop
+      super
+      @tree.openElements.pop
    end
-end
+  end

-class XmlElementPhase < Phase
+  class XmlElementPhase < Phase
    # Generic handling for all XML elements

    @start_tag_handlers = Hash.new(:startTagOther)
    @end_tag_handlers = Hash.new(:endTagOther)

    def startTagOther(name, attributes)
-        element = @tree.createElement(name, attributes)
-        @tree.openElements[-1].appendChild(element)
-        @tree.openElements.push(element)
+      element = @tree.createElement(name, attributes)
+      @tree.openElements[-1].appendChild(element)
+      @tree.openElements.push(element)
    end

    def endTagOther(name)
-        for node in @tree.openElements.reverse
-            if node.name == name
-                {} while @tree.openElements.pop != node
-                break
-            else
-                @parser.parseError
-            end
+      for node in @tree.openElements.reverse
+        if node.name == name
+          {} while @tree.openElements.pop != node
+          break
+        else
+          @parser.parseError
        end
+      end
    end

    def processCharacters(data)
-        @tree.insertText(data)
+      @tree.insertText(data)
    end
-end
+  end

 end