HTML5lib is Back.
Synced with latest version of HTML5lib, which fixes problem with Astral plane characters. I should really do some tests, but the HTML5lib Sanitizer seems to be 2-5 times slower than the old sanitizer.
This commit is contained in:
parent
e1a6827f1f
commit
4dd70af5ae
39 changed files with 4843 additions and 5576 deletions
|
@ -1,62 +1,62 @@
|
|||
require 'html5lib/constants'
|
||||
|
||||
class TokenizerTestParser
|
||||
def initialize(tokenizer)
|
||||
@tokenizer = tokenizer
|
||||
def initialize(tokenizer)
|
||||
@tokenizer = tokenizer
|
||||
end
|
||||
|
||||
def parse
|
||||
@outputTokens = []
|
||||
|
||||
debug = nil
|
||||
for token in @tokenizer
|
||||
debug = token.inspect if token[:type] == :ParseError
|
||||
send ('process' + token[:type].to_s), token
|
||||
end
|
||||
|
||||
def parse
|
||||
@outputTokens = []
|
||||
return @outputTokens
|
||||
end
|
||||
|
||||
debug = nil
|
||||
for token in @tokenizer
|
||||
debug = token.inspect if token[:type] == :ParseError
|
||||
send ('process' + token[:type].to_s), token
|
||||
end
|
||||
def processDoctype(token)
|
||||
@outputTokens.push(["DOCTYPE", token[:name], token[:data]])
|
||||
end
|
||||
|
||||
return @outputTokens
|
||||
def processStartTag(token)
|
||||
@outputTokens.push(["StartTag", token[:name], token[:data]])
|
||||
end
|
||||
|
||||
def processEmptyTag(token)
|
||||
if not HTML5lib::VOID_ELEMENTS.include? token[:name]
|
||||
@outputTokens.push("ParseError")
|
||||
end
|
||||
@outputTokens.push(["StartTag", token[:name], token[:data]])
|
||||
end
|
||||
|
||||
def processDoctype(token)
|
||||
@outputTokens.push(["DOCTYPE", token[:name], token[:data]])
|
||||
def processEndTag(token)
|
||||
if token[:data].length > 0
|
||||
self.processParseError(token)
|
||||
end
|
||||
@outputTokens.push(["EndTag", token[:name]])
|
||||
end
|
||||
|
||||
def processStartTag(token)
|
||||
@outputTokens.push(["StartTag", token[:name], token[:data]])
|
||||
end
|
||||
def processComment(token)
|
||||
@outputTokens.push(["Comment", token[:data]])
|
||||
end
|
||||
|
||||
def processEmptyTag(token)
|
||||
if not HTML5lib::VOID_ELEMENTS.include? token[:name]
|
||||
@outputTokens.push("ParseError")
|
||||
end
|
||||
@outputTokens.push(["StartTag", token[:name], token[:data]])
|
||||
end
|
||||
def processCharacters(token)
|
||||
@outputTokens.push(["Character", token[:data]])
|
||||
end
|
||||
|
||||
def processEndTag(token)
|
||||
if token[:data].length > 0
|
||||
self.processParseError(token)
|
||||
end
|
||||
@outputTokens.push(["EndTag", token[:name]])
|
||||
end
|
||||
alias processSpaceCharacters processCharacters
|
||||
|
||||
def processComment(token)
|
||||
@outputTokens.push(["Comment", token[:data]])
|
||||
end
|
||||
def processCharacters(token)
|
||||
@outputTokens.push(["Character", token[:data]])
|
||||
end
|
||||
|
||||
def processCharacters(token)
|
||||
@outputTokens.push(["Character", token[:data]])
|
||||
end
|
||||
def processEOF(token)
|
||||
end
|
||||
|
||||
alias processSpaceCharacters processCharacters
|
||||
|
||||
def processCharacters(token)
|
||||
@outputTokens.push(["Character", token[:data]])
|
||||
end
|
||||
|
||||
def processEOF(token)
|
||||
end
|
||||
|
||||
def processParseError(token)
|
||||
@outputTokens.push("ParseError")
|
||||
end
|
||||
def processParseError(token)
|
||||
@outputTokens.push("ParseError")
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue