HTML5lib Sanitizer
Replaced native Sanitizer with HTML5lib version. Synced with latest Maruku.
Parent: 457ec8627c
Commit: 6b21ac484f
36 changed files with 6534 additions and 215 deletions

vendor/plugins/HTML5lib/lib/html5lib.rb (vendored, new normal file, 11 additions)
@@ -0,0 +1,11 @@

require 'html5lib/html5parser'

module HTML5lib
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  def self.parseFragment(stream, options={})
    HTMLParser.parse(stream, options)
  end
end
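
A minimal usage sketch of this entry point (the markup and variable names are illustrative, and the return value depends on html5parser.rb, whose diff is suppressed further down):

    require 'html5lib'

    # Both helpers hand the options hash straight to HTMLParser.parse;
    # note that, as committed, parseFragment also delegates to parse.
    doc  = HTML5lib.parse('<p>Hello <b>world</b></p>')
    frag = HTML5lib.parseFragment('<b>bold</b> text')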

vendor/plugins/HTML5lib/lib/html5lib/constants.rb (vendored, new executable file, 676 additions)
@@ -0,0 +1,676 @@

module HTML5lib

  class EOF < Exception; end

  CONTENT_MODEL_FLAGS = [:PCDATA, :RCDATA, :CDATA, :PLAINTEXT]

  SCOPING_ELEMENTS = %w[button caption html marquee object table td th]

  FORMATTING_ELEMENTS = %w[a b big em font i nobr s small strike strong tt u]

  SPECIAL_ELEMENTS = %w[address area base basefont bgsound blockquote body br center
    col colgroup dd dir div dl dt embed fieldset form frame frameset h1 h2 h3 h4 h5 h6
    head hr iframe image img input isindex li link listing menu meta noembed noframes
    noscript ol optgroup option p param plaintext pre script select spacer style tbody
    textarea tfoot thead title tr ul wbr]

  SPACE_CHARACTERS = %W[\t \n \x0B \x0C \x20 \r]

  TABLE_INSERT_MODE_ELEMENTS = %w[table tbody tfoot thead tr]

  ASCII_LOWERCASE = ('a'..'z').to_a.join('')
  ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
  ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
  DIGITS = '0'..'9'
  HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a

  # Heading elements need to be ordered
  HEADING_ELEMENTS = %w[h1 h2 h3 h4 h5 h6]

  # XXX What about event-source and command?
  VOID_ELEMENTS = %w[base link meta hr br img embed param area col input]

  # entitiesWindows1252 has to be _ordered_ and needs to have an index.
  ENTITIES_WINDOWS1252 = [
    8364,  # 0x80  0x20AC  EURO SIGN
    65533, # 0x81          UNDEFINED
    8218,  # 0x82  0x201A  SINGLE LOW-9 QUOTATION MARK
    402,   # 0x83  0x0192  LATIN SMALL LETTER F WITH HOOK
    8222,  # 0x84  0x201E  DOUBLE LOW-9 QUOTATION MARK
    8230,  # 0x85  0x2026  HORIZONTAL ELLIPSIS
    8224,  # 0x86  0x2020  DAGGER
    8225,  # 0x87  0x2021  DOUBLE DAGGER
    710,   # 0x88  0x02C6  MODIFIER LETTER CIRCUMFLEX ACCENT
    8240,  # 0x89  0x2030  PER MILLE SIGN
    352,   # 0x8A  0x0160  LATIN CAPITAL LETTER S WITH CARON
    8249,  # 0x8B  0x2039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    338,   # 0x8C  0x0152  LATIN CAPITAL LIGATURE OE
    65533, # 0x8D          UNDEFINED
    381,   # 0x8E  0x017D  LATIN CAPITAL LETTER Z WITH CARON
    65533, # 0x8F          UNDEFINED
    65533, # 0x90          UNDEFINED
    8216,  # 0x91  0x2018  LEFT SINGLE QUOTATION MARK
    8217,  # 0x92  0x2019  RIGHT SINGLE QUOTATION MARK
    8220,  # 0x93  0x201C  LEFT DOUBLE QUOTATION MARK
    8221,  # 0x94  0x201D  RIGHT DOUBLE QUOTATION MARK
    8226,  # 0x95  0x2022  BULLET
    8211,  # 0x96  0x2013  EN DASH
    8212,  # 0x97  0x2014  EM DASH
    732,   # 0x98  0x02DC  SMALL TILDE
    8482,  # 0x99  0x2122  TRADE MARK SIGN
    353,   # 0x9A  0x0161  LATIN SMALL LETTER S WITH CARON
    8250,  # 0x9B  0x203A  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    339,   # 0x9C  0x0153  LATIN SMALL LIGATURE OE
    65533, # 0x9D          UNDEFINED
    382,   # 0x9E  0x017E  LATIN SMALL LETTER Z WITH CARON
    376    # 0x9F  0x0178  LATIN CAPITAL LETTER Y WITH DIAERESIS
  ]

  private

  def self.U n
    [n].pack('U')
  end

  public

  ENTITIES = {
    "AElig" => U(0xC6), "Aacute" => U(0xC1), "Acirc" => U(0xC2), "Agrave" => U(0xC0), "Alpha" => U(0x0391),
    "Aring" => U(0xC5), "Atilde" => U(0xC3), "Auml" => U(0xC4), "Beta" => U(0x0392), "Ccedil" => U(0xC7),
    "Chi" => U(0x03A7), "Dagger" => U(0x2021), "Delta" => U(0x0394), "ETH" => U(0xD0), "Eacute" => U(0xC9),
    "Ecirc" => U(0xCA), "Egrave" => U(0xC8), "Epsilon" => U(0x0395), "Eta" => U(0x0397), "Euml" => U(0xCB),
    "Gamma" => U(0x0393), "Iacute" => U(0xCD), "Icirc" => U(0xCE), "Igrave" => U(0xCC), "Iota" => U(0x0399),
    "Iuml" => U(0xCF), "Kappa" => U(0x039A), "Lambda" => U(0x039B), "Mu" => U(0x039C), "Ntilde" => U(0xD1),
    "Nu" => U(0x039D), "OElig" => U(0x0152), "Oacute" => U(0xD3), "Ocirc" => U(0xD4), "Ograve" => U(0xD2),
    "Omega" => U(0x03A9), "Omicron" => U(0x039F), "Oslash" => U(0xD8), "Otilde" => U(0xD5), "Ouml" => U(0xD6),
    "Phi" => U(0x03A6), "Pi" => U(0x03A0), "Prime" => U(0x2033), "Psi" => U(0x03A8), "Rho" => U(0x03A1),
    "Scaron" => U(0x0160), "Sigma" => U(0x03A3), "THORN" => U(0xDE), "Tau" => U(0x03A4), "Theta" => U(0x0398),
    "Uacute" => U(0xDA), "Ucirc" => U(0xDB), "Ugrave" => U(0xD9), "Upsilon" => U(0x03A5), "Uuml" => U(0xDC),
    "Xi" => U(0x039E), "Yacute" => U(0xDD), "Yuml" => U(0x0178), "Zeta" => U(0x0396), "aacute" => U(0xE1),
    "acirc" => U(0xE2), "acute" => U(0xB4), "aelig" => U(0xE6), "agrave" => U(0xE0), "alefsym" => U(0x2135),
    "alpha" => U(0x03B1), "amp" => U(0x26), "AMP" => U(0x26), "and" => U(0x2227), "ang" => U(0x2220),
    "apos" => U(0x27), "aring" => U(0xE5), "asymp" => U(0x2248), "atilde" => U(0xE3), "auml" => U(0xE4),
    "bdquo" => U(0x201E), "beta" => U(0x03B2), "brvbar" => U(0xA6), "bull" => U(0x2022), "cap" => U(0x2229),
    "ccedil" => U(0xE7), "cedil" => U(0xB8), "cent" => U(0xA2), "chi" => U(0x03C7), "circ" => U(0x02C6),
    "clubs" => U(0x2663), "cong" => U(0x2245), "copy" => U(0xA9), "COPY" => U(0xA9), "crarr" => U(0x21B5),
    "cup" => U(0x222A), "curren" => U(0xA4), "dArr" => U(0x21D3), "dagger" => U(0x2020), "darr" => U(0x2193),
    "deg" => U(0xB0), "delta" => U(0x03B4), "diams" => U(0x2666), "divide" => U(0xF7), "eacute" => U(0xE9),
    "ecirc" => U(0xEA), "egrave" => U(0xE8), "empty" => U(0x2205), "emsp" => U(0x2003), "ensp" => U(0x2002),
    "epsilon" => U(0x03B5), "equiv" => U(0x2261), "eta" => U(0x03B7), "eth" => U(0xF0), "euml" => U(0xEB),
    "euro" => U(0x20AC), "exist" => U(0x2203), "fnof" => U(0x0192), "forall" => U(0x2200), "frac12" => U(0xBD),
    "frac14" => U(0xBC), "frac34" => U(0xBE), "frasl" => U(0x2044), "gamma" => U(0x03B3), "ge" => U(0x2265),
    "gt" => U(0x3E), "GT" => U(0x3E), "hArr" => U(0x21D4), "harr" => U(0x2194), "hearts" => U(0x2665),
    "hellip" => U(0x2026), "iacute" => U(0xED), "icirc" => U(0xEE), "iexcl" => U(0xA1), "igrave" => U(0xEC),
    "image" => U(0x2111), "infin" => U(0x221E), "int" => U(0x222B), "iota" => U(0x03B9), "iquest" => U(0xBF),
    "isin" => U(0x2208), "iuml" => U(0xEF), "kappa" => U(0x03BA), "lArr" => U(0x21D0), "lambda" => U(0x03BB),
    "lang" => U(0x2329), "laquo" => U(0xAB), "larr" => U(0x2190), "lceil" => U(0x2308), "ldquo" => U(0x201C),
    "le" => U(0x2264), "lfloor" => U(0x230A), "lowast" => U(0x2217), "loz" => U(0x25CA), "lrm" => U(0x200E),
    "lsaquo" => U(0x2039), "lsquo" => U(0x2018), "lt" => U(0x3C), "LT" => U(0x3C), "macr" => U(0xAF),
    "mdash" => U(0x2014), "micro" => U(0xB5), "middot" => U(0xB7), "minus" => U(0x2212), "mu" => U(0x03BC),
    "nabla" => U(0x2207), "nbsp" => U(0xA0), "ndash" => U(0x2013), "ne" => U(0x2260), "ni" => U(0x220B),
    "not" => U(0xAC), "notin" => U(0x2209), "nsub" => U(0x2284), "ntilde" => U(0xF1), "nu" => U(0x03BD),
    "oacute" => U(0xF3), "ocirc" => U(0xF4), "oelig" => U(0x0153), "ograve" => U(0xF2), "oline" => U(0x203E),
    "omega" => U(0x03C9), "omicron" => U(0x03BF), "oplus" => U(0x2295), "or" => U(0x2228), "ordf" => U(0xAA),
    "ordm" => U(0xBA), "oslash" => U(0xF8), "otilde" => U(0xF5), "otimes" => U(0x2297), "ouml" => U(0xF6),
    "para" => U(0xB6), "part" => U(0x2202), "permil" => U(0x2030), "perp" => U(0x22A5), "phi" => U(0x03C6),
    "pi" => U(0x03C0), "piv" => U(0x03D6), "plusmn" => U(0xB1), "pound" => U(0xA3), "prime" => U(0x2032),
    "prod" => U(0x220F), "prop" => U(0x221D), "psi" => U(0x03C8), "quot" => U(0x22), "QUOT" => U(0x22),
    "rArr" => U(0x21D2), "radic" => U(0x221A), "rang" => U(0x232A), "raquo" => U(0xBB), "rarr" => U(0x2192),
    "rceil" => U(0x2309), "rdquo" => U(0x201D), "real" => U(0x211C), "reg" => U(0xAE), "REG" => U(0xAE),
    "rfloor" => U(0x230B), "rho" => U(0x03C1), "rlm" => U(0x200F), "rsaquo" => U(0x203A), "rsquo" => U(0x2019),
    "sbquo" => U(0x201A), "scaron" => U(0x0161), "sdot" => U(0x22C5), "sect" => U(0xA7), "shy" => U(0xAD),
    "sigma" => U(0x03C3), "sigmaf" => U(0x03C2), "sim" => U(0x223C), "spades" => U(0x2660), "sub" => U(0x2282),
    "sube" => U(0x2286), "sum" => U(0x2211), "sup" => U(0x2283), "sup1" => U(0xB9), "sup2" => U(0xB2),
    "sup3" => U(0xB3), "supe" => U(0x2287), "szlig" => U(0xDF), "tau" => U(0x03C4), "there4" => U(0x2234),
    "theta" => U(0x03B8), "thetasym" => U(0x03D1), "thinsp" => U(0x2009), "thorn" => U(0xFE), "tilde" => U(0x02DC),
    "times" => U(0xD7), "trade" => U(0x2122), "uArr" => U(0x21D1), "uacute" => U(0xFA), "uarr" => U(0x2191),
    "ucirc" => U(0xFB), "ugrave" => U(0xF9), "uml" => U(0xA8), "upsih" => U(0x03D2), "upsilon" => U(0x03C5),
    "uuml" => U(0xFC), "weierp" => U(0x2118), "xi" => U(0x03BE), "yacute" => U(0xFD), "yen" => U(0xA5),
    "yuml" => U(0xFF), "zeta" => U(0x03B6), "zwj" => U(0x200D), "zwnj" => U(0x200C)
  }

  ENCODINGS = %w[
    ansi_x3.4-1968 iso-ir-6 ansi_x3.4-1986 iso_646.irv:1991 ascii iso646-us us-ascii us
    ibm367 cp367 csascii ks_c_5601-1987 korean iso-2022-kr csiso2022kr euc-kr
    iso-2022-jp csiso2022jp iso-2022-jp-2 iso-ir-58 chinese csiso58gb231280 iso_8859-1:1987 iso-ir-100
    iso_8859-1 iso-8859-1 latin1 l1 ibm819 cp819 csisolatin1 iso_8859-2:1987
    iso-ir-101 iso_8859-2 iso-8859-2 latin2 l2 csisolatin2 iso_8859-3:1988 iso-ir-109
    iso_8859-3 iso-8859-3 latin3 l3 csisolatin3 iso_8859-4:1988 iso-ir-110 iso_8859-4
    iso-8859-4 latin4 l4 csisolatin4 iso_8859-6:1987 iso-ir-127 iso_8859-6 iso-8859-6
    ecma-114 asmo-708 arabic csisolatinarabic iso_8859-7:1987 iso-ir-126 iso_8859-7 iso-8859-7
    elot_928 ecma-118 greek greek8 csisolatingreek iso_8859-8:1988 iso-ir-138 iso_8859-8
    iso-8859-8 hebrew csisolatinhebrew iso_8859-5:1988 iso-ir-144 iso_8859-5 iso-8859-5 cyrillic
    csisolatincyrillic iso_8859-9:1989 iso-ir-148 iso_8859-9 iso-8859-9 latin5 l5 csisolatin5
    iso-8859-10 iso-ir-157 l6 iso_8859-10:1992 csisolatin6 latin6 hp-roman8 roman8
    r8 ibm037 cp037 csibm037 ibm424 cp424 csibm424 ibm437
    cp437 437 cspc8codepage437 ibm500 cp500 csibm500 ibm775 cp775
    cspc775baltic ibm850 cp850 850 cspc850multilingual ibm852 cp852 852
    cspcp852 ibm855 cp855 855 csibm855 ibm857 cp857 857
    csibm857 ibm860 cp860 860 csibm860 ibm861 cp861 861
    cp-is csibm861 ibm862 cp862 862 cspc862latinhebrew ibm863 cp863
    863 csibm863 ibm864 cp864 csibm864 ibm865 cp865 865
    csibm865 ibm866 cp866 866 csibm866 ibm869 cp869 869
    cp-gr csibm869 ibm1026 cp1026 csibm1026 koi8-r cskoi8r koi8-u
    big5-hkscs ptcp154 csptcp154 pt154 cp154 utf-7 utf-16be utf-16le
    utf-16 utf-8 iso-8859-13 iso-8859-14 iso-ir-199 iso_8859-14:1998 iso_8859-14 latin8
    iso-celtic l8 iso-8859-15 iso_8859-15 iso-8859-16 iso-ir-226 iso_8859-16:2001 iso_8859-16
    latin10 l10 gbk cp936 ms936 gb18030 shift_jis ms_kanji
    csshiftjis euc-jp gb2312 big5 csbig5 windows-1250 windows-1251 windows-1252
    windows-1253 windows-1254 windows-1255 windows-1256 windows-1257 windows-1258 tis-620 hz-gb-2312
  ]

end
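
For a feel of how these tables are consumed elsewhere in the library, a couple of hedged lookups (the result comments are worked out from the table contents above, not captured output):

    HTML5lib::ENTITIES['amp']                    # => "&"  (built with the private U() helper)
    HTML5lib::ENTITIES['ldquo']                  # => the UTF-8 encoding of U+201C
    HTML5lib::ENTITIES_WINDOWS1252[0x93 - 0x80]  # => 8220, the codepoint a stray 0x93 byte is remapped to
    HTML5lib::HEX_DIGITS.include?('A')           # => true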

vendor/plugins/HTML5lib/lib/html5lib/html5parser.rb (vendored, new normal file, 2020 additions)
File diff suppressed because it is too large.

vendor/plugins/HTML5lib/lib/html5lib/inputstream.rb (vendored, new executable file, 549 additions)
@@ -0,0 +1,549 @@

require 'stringio'
require 'html5lib/constants'

module HTML5lib

  # Provides a unicode stream of characters to the HTMLTokenizer.
  # This class takes care of character encoding and removing or replacing
  # incorrect byte-sequences and also provides column and line tracking.

  class HTMLInputStream

    attr_accessor :queue, :charEncoding

    # Initialises the HTMLInputStream.
    #
    # HTMLInputStream(source, [encoding]) -> Normalized stream from source
    # for use by the HTML5Lib.
    #
    # source can be either a file-object, local filename or a string.
    #
    # The optional encoding parameter must be a string that indicates
    # the encoding. If specified, that encoding will be used,
    # regardless of any BOM or later declaration (such as in a meta
    # element)
    #
    # parseMeta - Look for a <meta> element containing encoding information

    def initialize(source, options = {})
      @encoding = nil
      @parseMeta = true
      @chardet = true

      options.each { |name, value| instance_variable_set("@#{name}", value) }

      # List of where new lines occur
      @newLines = []

      # Raw Stream
      @rawStream = openStream(source)

      # Encoding Information
      # Number of bytes to use when looking for a meta element with
      # encoding information
      @NUM_BYTES_META = 512
      # Encoding to use if no other information can be found
      @DEFAULT_ENCODING = 'windows-1252'

      # Detect encoding iff no explicit "transport level" encoding is supplied
      if @encoding.nil? or not HTML5lib.isValidEncoding(@encoding)
        @charEncoding = detectEncoding
      else
        @charEncoding = @encoding
      end

      # Read bytes from stream decoding them into Unicode
      uString = @rawStream.read
      unless @charEncoding == 'utf-8'
        begin
          require 'iconv'
          uString = Iconv.iconv('utf-8', @encoding, uString)[0]
        rescue
        end
      end

      # Normalize newlines and null characters
      uString.gsub!(/\r\n?/, "\n")
      uString.gsub!("\x00", [0xFFFD].pack('U'))

      # Convert the unicode string into a list to be used as the data stream
      @dataStream = uString

      @queue = []

      # Reset position in the list to read from
      reset
    end

    # Produces a file object from source.
    #
    # source can be either a file object, local filename or a string.
    def openStream(source)
      # Already an IO like object
      if source.respond_to?(:read)
        @stream = source
      else
        # Treat source as a string and wrap in StringIO
        @stream = StringIO.new(source)
      end
      return @stream
    end

    def detectEncoding
      # First look for a BOM
      # This will also read past the BOM if present
      encoding = detectBOM
      # If there is no BOM need to look for meta elements with encoding
      # information
      if encoding.nil? and @parseMeta
        encoding = detectEncodingMeta
      end
      # Guess with chardet, if avaliable
      if encoding.nil? and @chardet
        begin
          require 'rubygems'
          require 'UniversalDetector' # gem install chardet
          buffer = @rawStream.read
          encoding = UniversalDetector::chardet(buffer)['encoding']
          @rawStream = openStream(buffer)
        rescue LoadError
        end
      end
      # If all else fails use the default encoding
      if encoding.nil?
        encoding = @DEFAULT_ENCODING
      end

      # Substitute for equivalent encodings:
      encodingSub = {'ascii' => 'windows-1252', 'iso-8859-1' => 'windows-1252'}

      if encodingSub.has_key?(encoding.downcase)
        encoding = encodingSub[encoding.downcase]
      end

      return encoding
    end

    # Attempts to detect at BOM at the start of the stream. If
    # an encoding can be determined from the BOM return the name of the
    # encoding otherwise return nil
    def detectBOM
      bomDict = {
        "\xef\xbb\xbf" => 'utf-8',
        "\xff\xfe" => 'utf-16-le',
        "\xfe\xff" => 'utf-16-be',
        "\xff\xfe\x00\x00" => 'utf-32-le',
        "\x00\x00\xfe\xff" => 'utf-32-be'
      }

      # Go to beginning of file and read in 4 bytes
      @rawStream.seek(0)
      string = @rawStream.read(4)
      return nil unless string

      # Try detecting the BOM using bytes from the string
      encoding = bomDict[string[0...3]] # UTF-8
      seek = 3
      unless encoding
        # Need to detect UTF-32 before UTF-16
        encoding = bomDict[string] # UTF-32
        seek = 4
        unless encoding
          encoding = bomDict[string[0...2]] # UTF-16
          seek = 2
        end
      end

      # AT - move this to the caller?
      # Set the read position past the BOM if one was found, otherwise
      # set it to the start of the stream
      @rawStream.seek(encoding ? seek : 0)

      return encoding
    end

    # Report the encoding declared by the meta element
    def detectEncodingMeta
      parser = EncodingParser.new(@rawStream.read(@NUM_BYTES_META))
      @rawStream.seek(0)
      return parser.getEncoding
    end

    def determineNewLines
      # Looks through the stream to find where new lines occur so
      # the position method can tell where it is.
      @newLines.push(0)
      (0...@dataStream.length).each { |i| @newLines.push(i) if @dataStream[i] == ?\n }
    end

    # Returns (line, col) of the current position in the stream.
    def position
      # Generate list of new lines first time around
      determineNewLines if @newLines.empty?
      line = 0
      tell = @tell
      @newLines.each do |pos|
        break unless pos < tell
        line += 1
      end
      col = tell - @newLines[line-1] - 1
      return [line, col]
    end

    # Resets the position in the stream back to the start.
    def reset
      @tell = 0
    end

    # Read one character from the stream or queue if available. Return
    # EOF when EOF is reached.
    def char
      unless @queue.empty?
        return @queue.shift
      else
        begin
          @tell += 1
          return @dataStream[@tell - 1].chr
        rescue
          return :EOF
        end
      end
    end

    # Returns a string of characters from the stream up to but not
    # including any character in characters or EOF. characters can be
    # any container that supports the in method being called on it.
    def charsUntil(characters, opposite = false)
      charStack = [char]

      unless charStack[0] == :EOF
        while (characters.include? charStack[-1]) == opposite
          unless @queue.empty?
            # First from the queue
            charStack.push(@queue.shift)
            break if charStack[-1] == :EOF
          else
            # Then the rest
            begin
              charStack.push(@dataStream[@tell].chr)
              @tell += 1
            rescue
              charStack.push(:EOF)
              break
            end
          end
        end
      end

      # Put the character stopped on back to the front of the queue
      # from where it came.
      @queue.insert(0, charStack.pop)
      return charStack.join('')
    end
  end

  # String-like object with an assosiated position and various extra methods
  # If the position is ever greater than the string length then an exception is raised
  class EncodingBytes < String

    attr_accessor :position

    def initialize(value)
      super(value)
      @position = -1
    end

    def each
      while @position < length
        @position += 1
        yield self[@position]
      end
    rescue EOF
    end

    def currentByte
      raise EOF if @position >= length
      return self[@position].chr
    end

    # Skip past a list of characters
    def skip(chars = SPACE_CHARACTERS)
      while chars.include?(currentByte)
        @position += 1
      end
    end

    # Look for a sequence of bytes at the start of a string. If the bytes
    # are found return true and advance the position to the byte after the
    # match. Otherwise return false and leave the position alone
    def matchBytes(bytes, lower = false)
      data = self[position ... position+bytes.length]
      data.downcase! if lower
      rv = (data == bytes)
      @position += bytes.length if rv == true
      return rv
    end

    # Look for the next sequence of bytes matching a given sequence. If
    # a match is found advance the position to the last byte of the match
    def jumpTo(bytes)
      newPosition = self[position .. -1].index(bytes)
      if newPosition
        @position += (newPosition + bytes.length-1)
        return true
      else
        raise EOF
      end
    end

    # Move the pointer so it points to the next byte in a set of possible
    # bytes
    def findNext(byteList)
      until byteList.include?(currentByte)
        @position += 1
      end
    end
  end

  # Mini parser for detecting character encoding from meta elements
  class EncodingParser

    # string - the data to work on for encoding detection
    def initialize(data)
      @data = EncodingBytes.new(data.to_s)
      @encoding = nil
    end

    @@method_dispatch = [
      ['<!--', :handleComment],
      ['<meta', :handleMeta],
      ['</', :handlePossibleEndTag],
      ['<!', :handleOther],
      ['<?', :handleOther],
      ['<', :handlePossibleStartTag]
    ]

    def getEncoding
      @data.each do |byte|
        keepParsing = true
        @@method_dispatch.each do |(key, method)|
          if @data.matchBytes(key, lower = true)
            keepParsing = send(method)
            break
          end
        end
        break unless keepParsing
      end
      @encoding = @encoding.strip unless @encoding.nil?
      return @encoding
    end

    # Skip over comments
    def handleComment
      return @data.jumpTo('-->')
    end

    def handleMeta
      # if we have <meta not followed by a space so just keep going
      return true unless SPACE_CHARACTERS.include?(@data.currentByte)

      # We have a valid meta element we want to search for attributes
      while true
        # Try to find the next attribute after the current position
        attr = getAttribute

        return true if attr.nil?

        if attr[0] == 'charset'
          tentativeEncoding = attr[1]
          if HTML5lib.isValidEncoding(tentativeEncoding)
            @encoding = tentativeEncoding
            return false
          end
        elsif attr[0] == 'content'
          contentParser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
          tentativeEncoding = contentParser.parse
          if HTML5lib.isValidEncoding(tentativeEncoding)
            @encoding = tentativeEncoding
            return false
          end
        end
      end
    end

    def handlePossibleStartTag
      return handlePossibleTag(false)
    end

    def handlePossibleEndTag
      @data.position += 1
      return handlePossibleTag(true)
    end

    def handlePossibleTag(endTag)
      unless ASCII_LETTERS.include?(@data.currentByte)
        # If the next byte is not an ascii letter either ignore this
        # fragment (possible start tag case) or treat it according to
        # handleOther
        if endTag
          @data.position -= 1
          handleOther
        end
        return true
      end

      @data.findNext(SPACE_CHARACTERS + ['<', '>'])

      if @data.currentByte == '<'
        # return to the first step in the overall "two step" algorithm
        # reprocessing the < byte
        @data.position -= 1
      else
        # Read all attributes
        {} until getAttribute.nil?
      end
      return true
    end

    def handleOther
      return @data.jumpTo('>')
    end

    # Return a name,value pair for the next attribute in the stream,
    # if one is found, or nil
    def getAttribute
      @data.skip(SPACE_CHARACTERS + ['/'])

      if @data.currentByte == '<'
        @data.position -= 1
        return nil
      elsif @data.currentByte == '>'
        return nil
      end

      attrName = []
      attrValue = []
      spaceFound = false
      # Step 5 attribute name
      while true
        if @data.currentByte == '=' and attrName:
          break
        elsif SPACE_CHARACTERS.include?(@data.currentByte)
          spaceFound = true
          break
        elsif ['/', '<', '>'].include?(@data.currentByte)
          return [attrName.join(''), '']
        elsif ASCII_UPPERCASE.include?(@data.currentByte)
          attrName.push(@data.currentByte.downcase)
        else
          attrName.push(@data.currentByte)
        end
        # Step 6
        @data.position += 1
      end
      # Step 7
      if spaceFound
        @data.skip
        # Step 8
        unless @data.currentByte == '='
          @data.position -= 1
          return [attrName.join(''), '']
        end
      end
      # XXX need to advance position in both spaces and value case
      # Step 9
      @data.position += 1
      # Step 10
      @data.skip
      # Step 11
      if ["'", '"'].include?(@data.currentByte)
        # 11.1
        quoteChar = @data.currentByte
        while true
          @data.position += 1
          # 11.3
          if @data.currentByte == quoteChar
            @data.position += 1
            return [attrName.join(''), attrValue.join('')]
          # 11.4
          elsif ASCII_UPPERCASE.include?(@data.currentByte)
            attrValue.push(@data.currentByte.downcase)
          # 11.5
          else
            attrValue.push(@data.currentByte)
          end
        end
      elsif ['>', '<'].include?(@data.currentByte)
        return [attrName.join(''), '']
      elsif ASCII_UPPERCASE.include?(@data.currentByte)
        attrValue.push(@data.currentByte.downcase)
      else
        attrValue.push(@data.currentByte)
      end
      while true
        @data.position += 1
        if (SPACE_CHARACTERS + ['>', '<']).include?(@data.currentByte)
          return [attrName.join(''), attrValue.join('')]
        elsif ASCII_UPPERCASE.include?(@data.currentByte)
          attrValue.push(@data.currentByte.downcase)
        else
          attrValue.push(@data.currentByte)
        end
      end
    end
  end

  class ContentAttrParser
    def initialize(data)
      @data = data
    end

    def parse
      begin
        # Skip to the first ";"
        @data.position = 0
        @data.jumpTo(';')
        @data.position += 1
        @data.skip
        # Check if the attr name is charset
        # otherwise return
        @data.jumpTo('charset')
        @data.position += 1
        @data.skip
        unless @data.currentByte == '='
          # If there is no = sign keep looking for attrs
          return nil
        end
        @data.position += 1
        @data.skip
        # Look for an encoding between matching quote marks
        if ['"', "'"].include?(@data.currentByte)
          quoteMark = @data.currentByte
          @data.position += 1
          oldPosition = @data.position
          @data.jumpTo(quoteMark)
          return @data[oldPosition ... @data.position]
        else
          # Unquoted value
          oldPosition = @data.position
          begin
            @data.findNext(SPACE_CHARACTERS)
            return @data[oldPosition ... @data.position]
          rescue EOF
            # Return the whole remaining value
            return @data[oldPosition .. -1]
          end
        end
      rescue EOF
        return nil
      end
    end
  end

  # Determine if a string is a supported encoding
  def self.isValidEncoding(encoding)
    (not encoding.nil? and encoding.kind_of?(String) and ENCODINGS.include?(encoding.downcase.strip))
  end

end
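
A short sketch of exercising the stream on its own, assuming the Ruby 1.8-era string semantics this code targets and no chardet gem installed (inputs and result comments are inferred from the code above, not captured test output):

    require 'html5lib/inputstream'

    # A UTF-8 byte-order mark is detected first and read past.
    HTML5lib::HTMLInputStream.new("\xef\xbb\xbf<p>hi</p>").charEncoding         # => 'utf-8'

    # Failing that, a <meta charset> within the first 512 bytes is honoured
    # (attribute values are downcased as they are read).
    HTML5lib::HTMLInputStream.new('<meta charset="ISO-8859-2">').charEncoding   # => 'iso-8859-2'

    # With no other information, the documented fallback applies.
    HTML5lib::HTMLInputStream.new('<p>hi</p>').charEncoding                     # => 'windows-1252'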

vendor/plugins/HTML5lib/lib/html5lib/liberalxmlparser.rb (vendored, new executable file, 141 additions)
@@ -0,0 +1,141 @@

# Warning: this module is experimental and subject to change and even removal
# at any time.
#
# For background/rationale, see:
#  * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
#  * http://tinyurl.com/ylfj8k (and follow-ups)
#
# References:
#  * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
#  * http://wiki.whatwg.org/wiki/HtmlVsXhtml
#
# @@TODO:
#  * Selectively lowercase only XHTML, but not foreign markup
require 'html5lib/html5parser'
require 'html5lib/constants'

module HTML5lib

  # liberal XML parser
  class XMLParser < HTMLParser

    def initialize(options={})
      super options
      @phases[:initial] = XmlRootPhase.new(self, @tree)
    end

    def normalizeToken(token)
      if token[:type] == :StartTag or token[:type] == :EmptyTag
        # We need to remove the duplicate attributes and convert attributes
        # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}

        token[:data] = Hash[*token[:data].reverse.flatten]

        # For EmptyTags, process both a Start and an End tag
        if token[:type] == :EmptyTag
          @phase.processStartTag(token[:name], token[:data])
          token[:data] = {}
          token[:type] = :EndTag
        end

      elsif token[:type] == :EndTag
        if token[:data]
          parseError(_("End tag contains unexpected attributes."))
        end

      elsif token[:type] == :Comment
        # Rescue CDATA from the comments
        if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
          token[:type] = :Characters
          token[:data] = token[:data][7 ... -2]
        end
      end

      return token
    end
  end

  # liberal XHTML parser
  class XHTMLParser < XMLParser

    def initialize(options={})
      super options
      @phases[:initial] = InitialPhase.new(self, @tree)
      @phases[:rootElement] = XhmlRootPhase.new(self, @tree)
    end

    def normalizeToken(token)
      super(token)

      # ensure that non-void XHTML elements have content so that separate
      # open and close tags are emitted
      if token[:type] == :EndTag and \
        not VOID_ELEMENTS.include? token[:name] and \
        token[:name] == @tree.openElements[-1].name and \
        not @tree.openElements[-1].hasContent
        @tree.insertText('') unless
          @tree.openElements.any? {|e|
            e.attributes.keys.include? 'xmlns' and
            e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
          }
      end

      return token
    end
  end

  class XhmlRootPhase < RootElementPhase
    def insertHtmlElement
      element = @tree.createElement("html", {'xmlns' => 'http://www.w3.org/1999/xhtml'})
      @tree.openElements.push(element)
      @tree.document.appendChild(element)
      @parser.phase = @parser.phases[:beforeHead]
    end
  end

  class XmlRootPhase < Phase
    # Prime the Xml parser
    @start_tag_handlers = Hash.new(:startTagOther)
    @end_tag_handlers = Hash.new(:endTagOther)

    def startTagOther(name, attributes)
      @tree.openElements.push(@tree.document)
      element = @tree.createElement(name, attributes)
      @tree.openElements[-1].appendChild(element)
      @tree.openElements.push(element)
      @parser.phase = XmlElementPhase.new(@parser, @tree)
    end

    def endTagOther(name)
      super
      @tree.openElements.pop
    end
  end

  class XmlElementPhase < Phase
    # Generic handling for all XML elements

    @start_tag_handlers = Hash.new(:startTagOther)
    @end_tag_handlers = Hash.new(:endTagOther)

    def startTagOther(name, attributes)
      element = @tree.createElement(name, attributes)
      @tree.openElements[-1].appendChild(element)
      @tree.openElements.push(element)
    end

    def endTagOther(name)
      for node in @tree.openElements.reverse
        if node.name == name
          {} while @tree.openElements.pop != node
          break
        else
          @parser.parseError
        end
      end
    end

    def processCharacters(data)
      @tree.insertText(data)
    end
  end

end
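
To make the CDATA-rescue branch of normalizeToken concrete, here is how a comment token would be rewritten (the token hash is illustrative, not taken from a test in this commit):

    token = {:type => :Comment, :data => '[CDATA[x < y]]'}
    # After XMLParser#normalizeToken(token):
    #   token[:type]  # => :Characters
    #   token[:data]  # => 'x < y'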

vendor/plugins/HTML5lib/lib/html5lib/sanitizer.rb (vendored, new normal file, 178 additions)
@@ -0,0 +1,178 @@

require 'html5lib/tokenizer'
require 'cgi'

module HTML5lib

  # This module provides sanitization of XHTML+MathML+SVG
  # and of inline style attributes.

  class HTMLSanitizer < HTMLTokenizer

    ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
      button caption center cite code col colgroup dd del dfn dir div dl dt
      em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
      legend li map menu ol optgroup option p pre q s samp select small span
      strike strong sub sup table tbody td textarea tfoot th thead tr tt u
      ul var]

    MATHML_ELEMENTS = %w[maction math merror mfrac mi mmultiscripts mn mo
      mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub
      msubsup msup mtable mtd mtext mtr munder munderover none]

    SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform
      circle defs desc ellipse font-face font-face-name font-face-src g
      glyph hkern image linearGradient line marker metadata missing-glyph
      mpath path polygon polyline radialGradient rect set stop svg switch
      text title tspan use]

    ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action
      align alt axis border cellpadding cellspacing char charoff charset
      checked cite class clear cols colspan color compact coords datetime
      dir disabled enctype for frame headers height href hreflang hspace id
      ismap label lang longdesc maxlength media method multiple name nohref
      noshade nowrap prompt readonly rel rev rows rowspan rules scope
      selected shape size span src start style summary tabindex target title
      type usemap valign value vspace width xml:lang]

    MATHML_ATTRIBUTES = %w[actiontype align columnalign columnalign
      columnalign columnlines columnspacing columnspan depth display
      displaystyle equalcolumns equalrows fence fontstyle fontweight frame
      height linethickness lspace mathbackground mathcolor mathvariant
      mathvariant maxsize minsize other rowalign rowalign rowalign rowlines
      rowspacing rowspan rspace scriptlevel selection separator stretchy
      width width xlink:href xlink:show xlink:type xmlns xmlns:xlink]

    SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic
      arabic-form ascent attributeName attributeType baseProfile bbox begin
      by calcMode cap-height class color color-rendering content cx cy d dx
      dy descent display dur end fill fill-rule font-family font-size
      font-stretch font-style font-variant font-weight from fx fy g1 g2
      glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
      ideographic k keyPoints keySplines keyTimes lang marker-end
      marker-mid marker-start markerHeight markerUnits markerWidth
      mathematical max min name offset opacity orient origin
      overline-position overline-thickness panose-1 path pathLength points
      preserveAspectRatio r refX refY repeatCount repeatDur
      requiredExtensions requiredFeatures restart rotate rx ry slope stemh
      stemv stop-color stop-opacity strikethrough-position
      strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
      stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
      stroke-width systemLanguage target text-anchor to transform type u1
      u2 underline-position underline-thickness unicode unicode-range
      units-per-em values version viewBox visibility width widths x
      x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
      xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
      xmlns:xlink y y1 y2 zoomAndPan]

    ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href]

    ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color
      border-bottom-color border-collapse border-color border-left-color
      border-right-color border-top-color clear color cursor direction
      display elevation float font font-family font-size font-style
      font-variant font-weight height letter-spacing line-height overflow
      pause pause-after pause-before pitch pitch-range richness speak
      speak-header speak-numeral speak-punctuation speech-rate stress
      text-align text-decoration text-indent unicode-bidi vertical-align
      voice-family volume white-space width]

    ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom
      brown center collapse dashed dotted fuchsia gray green !important
      italic left lime maroon medium none navy normal nowrap olive pointer
      purple red right solid silver teal top transparent underline white
      yellow]

    ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke
      stroke-width stroke-linecap stroke-linejoin stroke-opacity]

    ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp
      telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]

    # subclasses may define their own versions of these constants
    ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
    ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
    ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
    ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
    ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
    ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style
    # attributes are parsed, and a restricted set, specified by
    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
    # Attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
    # in ALLOWED_PROTOCOLS are allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => &lt;script> do_nasty_stuff() &lt;/script>
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def each
      super do |token|
        case token[:type]
        when :StartTag, :EndTag, :EmptyTag
          if ALLOWED_ELEMENTS.include?(token[:name])
            if token.has_key? :data
              attrs = Hash[*token[:data].flatten]
              attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
              ATTR_VAL_IS_URI.each do |attr|
                val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
                if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
                  attrs.delete attr
                end
              end
              if attrs['style']
                attrs['style'] = sanitize_css(attrs['style'])
              end
              token[:data] = attrs.map {|k,v| [k,v]}
            end
            yield token
          else
            if token[:type] == :EndTag
              token[:data] = "</#{token[:name]}>"
            elsif token[:data]
              attrs = token[:data].map {|k,v| " #{k}=\"#{CGI.escapeHTML(v)}\""}.join('')
              token[:data] = "<#{token[:name]}#{attrs}>"
            else
              token[:data] = "<#{token[:name]}>"
            end
            token[:data].insert(-2,'/') if token[:type] == :EmptyTag
            token[:type] = :Characters
            token.delete(:name)
            yield token
          end
        else
          yield token
        end

      end
    end

    def sanitize_css(style)
      # disallow urls
      style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

      # gauntlet
      return '' unless style =~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
      return '' unless style =~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/

      clean = []
      style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
        next if val.empty?
        prop.downcase!
        if ALLOWED_CSS_PROPERTIES.include?(prop)
          clean << "#{prop}: #{val};"
        elsif %w[background border margin padding].include?(prop.split('-')[0])
          clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
            !ALLOWED_CSS_KEYWORDS.include?(keyword) and
            keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
          end
        elsif ALLOWED_SVG_PROPERTIES.include?(prop)
          clean << "#{prop}: #{val};"
        end
      end

      style = clean.join(' ')
    end
  end
end
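
Because HTMLSanitizer is simply a tokenizer whose each filters what it yields, a minimal way to exercise it is to walk the token stream (re-serialising tokens back into markup happens outside this file, so only the tokens are shown here):

    require 'html5lib/sanitizer'

    html = '<a href="javascript:alert(1)" title="hi">x</a><script>bad()</script>'
    HTML5lib::HTMLSanitizer.new(html).each do |token|
      # The <a> start tag survives with its javascript: href dropped and title kept;
      # the <script> tags come through as escaped :Characters tokens.
      p token
    end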
854
vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb
vendored
Normal file
854
vendor/plugins/HTML5lib/lib/html5lib/tokenizer.rb
vendored
Normal file
|
@ -0,0 +1,854 @@
|
|||
require 'html5lib/constants'
|
||||
require 'html5lib/inputstream'
|
||||
|
||||
module HTML5lib
|
||||
|
||||
# This class takes care of tokenizing HTML.
|
||||
#
|
||||
# * @currentToken
|
||||
# Holds the token that is currently being processed.
|
||||
#
|
||||
# * @state
|
||||
# Holds a reference to the method to be invoked... XXX
|
||||
#
|
||||
# * @states
|
||||
# Holds a mapping between states and methods that implement the state.
|
||||
#
|
||||
# * @stream
|
||||
# Points to HTMLInputStream object.
|
||||
|
||||
class HTMLTokenizer
|
||||
attr_accessor :contentModelFlag, :currentToken
|
||||
attr_reader :stream
|
||||
|
||||
# XXX need to fix documentation
|
||||
|
||||
def initialize(stream, options={})
|
||||
@stream = HTMLInputStream.new(stream, options)
|
||||
|
||||
@states = {
|
||||
:data => :dataState,
|
||||
:entityData => :entityDataState,
|
||||
:tagOpen => :tagOpenState,
|
||||
:closeTagOpen => :closeTagOpenState,
|
||||
:tagName => :tagNameState,
|
||||
:beforeAttributeName => :beforeAttributeNameState,
|
||||
:attributeName => :attributeNameState,
|
||||
:afterAttributeName => :afterAttributeNameState,
|
||||
:beforeAttributeValue => :beforeAttributeValueState,
|
||||
:attributeValueDoubleQuoted => :attributeValueDoubleQuotedState,
|
||||
:attributeValueSingleQuoted => :attributeValueSingleQuotedState,
|
||||
:attributeValueUnQuoted => :attributeValueUnQuotedState,
|
||||
:bogusComment => :bogusCommentState,
|
||||
:markupDeclarationOpen => :markupDeclarationOpenState,
|
||||
:comment => :commentState,
|
||||
:commentDash => :commentDashState,
|
||||
:commentEnd => :commentEndState,
|
||||
:doctype => :doctypeState,
|
||||
:beforeDoctypeName => :beforeDoctypeNameState,
|
||||
:doctypeName => :doctypeNameState,
|
||||
:afterDoctypeName => :afterDoctypeNameState,
|
||||
:bogusDoctype => :bogusDoctypeState
|
||||
}
|
||||
|
||||
# Setup the initial tokenizer state
|
||||
@contentModelFlag = :PCDATA
|
||||
@state = @states[:data]
|
||||
|
||||
# The current token being created
|
||||
@currentToken = nil
|
||||
|
||||
# Tokens to be processed.
|
||||
@tokenQueue = []
|
||||
end
|
||||
|
||||
# This is where the magic happens.
|
||||
#
|
||||
# We do our usually processing through the states and when we have a token
|
||||
# to return we yield the token which pauses processing until the next token
|
||||
# is requested.
|
||||
def each
|
||||
@stream.reset
|
||||
@tokenQueue = []
|
||||
# Start processing. When EOF is reached @state will return false
|
||||
# instead of true and the loop will terminate.
|
||||
while send @state
|
||||
while not @tokenQueue.empty?
|
||||
yield @tokenQueue.shift
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Below are various helper functions the tokenizer states use worked out.
|
||||
|
||||
# If the next character is a '>', convert the currentToken into
|
||||
# an EmptyTag
|
||||
|
||||
def processSolidusInTag
|
||||
|
||||
# We need to consume another character to make sure it's a ">"
|
||||
data = @stream.char
|
||||
|
||||
if @currentToken[:type] == :StartTag and data == ">"
|
||||
@currentToken[:type] = :EmptyTag
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Solidus (/) incorrectly placed in tag.")})
|
||||
end
|
||||
|
||||
# The character we just consumed need to be put back on the stack so it
|
||||
# doesn't get lost...
|
||||
@stream.queue.push(data)
|
||||
end
|
||||
|
||||
# This function returns either U+FFFD or the character based on the
|
||||
# decimal or hexadecimal representation. It also discards ";" if present.
|
||||
# If not present @tokenQueue.push({:type => :ParseError}") is invoked.
|
||||
|
||||
def consumeNumberEntity(isHex)
|
||||
|
||||
# XXX More need to be done here. For instance, #13 should prolly be
|
||||
# converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
|
||||
# such. Thoughts on this appreciated.
|
||||
allowed = DIGITS
|
||||
radix = 10
|
||||
if isHex
|
||||
allowed = HEX_DIGITS
|
||||
radix = 16
|
||||
end
|
||||
|
||||
char = [0xFFFD].pack('U')
|
||||
charStack = []
|
||||
|
||||
# Consume all the characters that are in range while making sure we
|
||||
# don't hit an EOF.
|
||||
c = @stream.char
|
||||
while allowed.include?(c) and c != :EOF
|
||||
charStack.push(c)
|
||||
c = @stream.char
|
||||
end
|
||||
|
||||
# Convert the set of characters consumed to an int.
|
||||
charAsInt = charStack.join('').to_i(radix)
|
||||
|
||||
# If the integer is between 127 and 160 (so 128 and bigger and 159 and
|
||||
# smaller) we need to do the "windows trick".
|
||||
if (127...160).include? charAsInt
|
||||
#XXX - removed parse error from windows 1252 entity for now
|
||||
#we may want to reenable this later
|
||||
#@tokenQueue.push({:type => :ParseError, :data =>
|
||||
# _("Entity used with illegal number (windows-1252 reference).")})
|
||||
|
||||
charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
|
||||
end
|
||||
|
||||
# 0 is not a good number.
|
||||
if charAsInt == 0
|
||||
charAsInt = 65533
|
||||
end
|
||||
|
||||
if charAsInt <= 0x10FFF
|
||||
char = [charAsInt].pack('U')
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity couldn't be converted to character.")})
|
||||
end
|
||||
|
||||
# Discard the ; if present. Otherwise, put it back on the queue and
|
||||
# invoke parseError on parser.
|
||||
if c != ";"
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity didn't end with ';'.")})
|
||||
@stream.queue.push(c)
|
||||
end
|
||||
|
||||
return char
|
||||
end
|
||||
|
||||
def consumeEntity
|
||||
char = nil
|
||||
charStack = [@stream.char]
|
||||
if charStack[0] == "#"
|
||||
# We might have a number entity here.
|
||||
charStack += [@stream.char, @stream.char]
|
||||
if charStack.include? :EOF
|
||||
# If we reach the end of the file put everything up to :EOF
|
||||
# back in the queue
|
||||
charStack = charStack[0...charStack.index(:EOF)]
|
||||
@stream.queue+= charStack
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity expected. Got end of file instead.")})
|
||||
else
|
||||
if charStack[1].downcase == "x" \
|
||||
and HEX_DIGITS.include? charStack[2]
|
||||
# Hexadecimal entity detected.
|
||||
@stream.queue.push(charStack[2])
|
||||
char = consumeNumberEntity(true)
|
||||
elsif DIGITS.include? charStack[1]
|
||||
# Decimal entity detected.
|
||||
@stream.queue += charStack[1..-1]
|
||||
char = consumeNumberEntity(false)
|
||||
else
|
||||
# No number entity detected.
|
||||
@stream.queue += charStack
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Numeric entity expected but none found.")})
|
||||
end
|
||||
end
|
||||
# Break out if we reach the end of the file
|
||||
elsif charStack[0] == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Entity expected. Got end of file instead.")})
|
||||
else
|
||||
# At this point in the process might have named entity. Entities
|
||||
# are stored in the global variable "entities".
|
||||
#
|
||||
# Consume characters and compare to these to a substring of the
|
||||
# entity names in the list until the substring no longer matches.
|
||||
filteredEntityList = ENTITIES.keys
|
||||
filteredEntityList.reject! {|e| e[0].chr != charStack[0]}
|
||||
entityName = nil
|
||||
|
||||
while charStack[-1] != :EOF
|
||||
name = charStack.join('')
|
||||
if filteredEntityList.any? {|e| e[0...name.length] == name}
|
||||
filteredEntityList.reject! {|e| e[0...name.length] != name}
|
||||
charStack.push(@stream.char)
|
||||
else
|
||||
break
|
||||
end
|
||||
|
||||
if ENTITIES.include? name
|
||||
entityName = name
|
||||
end
|
||||
end
|
||||
|
||||
if entityName != nil
|
||||
char = ENTITIES[entityName]
|
||||
|
||||
# Check whether or not the last character returned can be
|
||||
# discarded or needs to be put back.
|
||||
if not charStack[-1] == ";"
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Named entity didn't end with ';'.")})
|
||||
@stream.queue += charStack[entityName.length..-1]
|
||||
end
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Named entity expected. Got none.")})
|
||||
@stream.queue += charStack
|
||||
end
|
||||
end
|
||||
return char
|
||||
end
|
||||
|
||||
# This method replaces the need for "entityInAttributeValueState".
|
||||
def processEntityInAttribute
|
||||
entity = consumeEntity
|
||||
if entity
|
||||
@currentToken[:data][-1][1] += entity
|
||||
else
|
||||
@currentToken[:data][-1][1] += "&"
|
||||
end
|
||||
end
|
||||
|
||||
# This method is a generic handler for emitting the tags. It also sets
|
||||
# the state to "data" because that's what's needed after a token has been
|
||||
# emitted.
|
||||
def emitCurrentToken
|
||||
# Add token to the queue to be yielded
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
end
|
||||
|
||||
|
||||
# Below are the various tokenizer states worked out.
|
||||
|
||||
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
|
||||
# documents to figure out what the order of the various if and elsif
|
||||
# statements should be.
|
||||
|
||||
def dataState
|
||||
data = @stream.char
|
||||
if data == "&" and (@contentModelFlag == :PCDATA or
|
||||
@contentModelFlag == :RCDATA)
|
||||
@state = @states[:entityData]
|
||||
elsif data == "<" and @contentModelFlag != :PLAINTEXT
|
||||
@state = @states[:tagOpen]
|
||||
elsif data == :EOF
|
||||
# Tokenization ends.
|
||||
return false
|
||||
elsif SPACE_CHARACTERS.include? data
|
||||
# Directly after emitting a token you switch back to the "data
|
||||
# state". At that point SPACE_CHARACTERS are important so they are
|
||||
# emitted separately.
|
||||
# XXX need to check if we don't need a special "spaces" flag on
|
||||
# characters.
|
||||
@tokenQueue.push({:type => :SpaceCharacters, :data =>
|
||||
data + @stream.charsUntil(SPACE_CHARACTERS, true)})
|
||||
else
|
||||
@tokenQueue.push({:type => :Characters, :data =>
|
||||
data + @stream.charsUntil(["&", "<"])})
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def entityDataState
|
||||
entity = consumeEntity
|
||||
if entity
|
||||
@tokenQueue.push({:type => :Characters, :data => entity})
|
||||
else
|
||||
@tokenQueue.push({:type => :Characters, :data => "&"})
|
||||
end
|
||||
@state = @states[:data]
|
||||
return true
|
||||
end
|
||||
|
||||
def tagOpenState
|
||||
data = @stream.char
|
||||
if @contentModelFlag == :PCDATA
|
||||
if data == "!"
|
||||
@state = @states[:markupDeclarationOpen]
|
||||
elsif data == "/"
|
||||
@state = @states[:closeTagOpen]
|
||||
elsif data != :EOF and ASCII_LETTERS.include? data
|
||||
@currentToken =\
|
||||
{:type => :StartTag, :name => data, :data => []}
|
||||
@state = @states[:tagName]
|
||||
elsif data == ">"
|
||||
# XXX In theory it could be something besides a tag name. But
|
||||
# do we really care?
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected tag name. Got '>' instead.")})
|
||||
@tokenQueue.push({:type => :Characters, :data => "<>"})
|
||||
@state = @states[:data]
|
||||
elsif data == "?"
|
||||
# XXX In theory it could be something besides a tag name. But
|
||||
# do we really care?
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected tag name. Got '?' instead (HTML doesn't " +
|
||||
"support processing instructions).")})
|
||||
@stream.queue.push(data)
|
||||
@state = @states[:bogusComment]
|
||||
else
|
||||
# XXX
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected tag name. Got something else instead")})
|
||||
@tokenQueue.push({:type => :Characters, :data => "<"})
|
||||
@stream.queue.push(data)
|
||||
@state = @states[:data]
|
||||
end
|
||||
else
|
||||
# We know the content model flag is set to either RCDATA or CDATA
|
||||
# now because this state can never be entered with the PLAINTEXT
|
||||
# flag.
|
||||
if data == "/"
|
||||
@state = @states[:closeTagOpen]
|
||||
else
|
||||
@tokenQueue.push({:type => :Characters, :data => "<"})
|
||||
@stream.queue.insert(0, data)
|
||||
@state = @states[:data]
|
||||
end
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def closeTagOpenState
|
||||
if (@contentModelFlag == :RCDATA or @contentModelFlag == :CDATA)
|
||||
if @currentToken
|
||||
charStack = []
|
||||
|
||||
# So far we know that "</" has been consumed. We now need to know
|
||||
# whether the next few characters match the name of last emitted
|
||||
# start tag which also happens to be the currentToken. We also need
|
||||
# to have the character directly after the characters that could
|
||||
# match the start tag name.
|
||||
(@currentToken[:name].length + 1).times do
|
||||
charStack.push(@stream.char)
|
||||
# Make sure we don't get hit by :EOF
|
||||
break if charStack[-1] == :EOF
|
||||
end
|
||||
|
||||
# Since this is just for checking. We put the characters back on
|
||||
# the stack.
|
||||
@stream.queue += charStack
|
||||
end
|
||||
|
||||
if @currentToken and
|
||||
@currentToken[:name].downcase ==
|
||||
charStack[0...-1].join('').downcase and
|
||||
(SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? charStack[-1]
|
||||
# Because the characters are correct we can safely switch to
|
||||
# PCDATA mode now. This also means we don't have to do it when
|
||||
# emitting the end tag token.
|
||||
@contentModelFlag = :PCDATA
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected closing tag after seeing '</'. None found.")})
|
||||
@tokenQueue.push({:type => :Characters, :data => "</"})
|
||||
@state = @states[:data]
|
||||
|
||||
# Need to return here since we don't want the rest of the
|
||||
# method to be walked through.
|
||||
return true
|
||||
end
|
||||
end
|
||||
|
||||
if @contentModelFlag == :PCDATA
|
||||
data = @stream.char
|
||||
if data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected closing tag. Unexpected end of file.")})
|
||||
@tokenQueue.push({:type => :Characters, :data => "</"})
|
||||
@state = @states[:data]
|
||||
elsif ASCII_LETTERS.include? data
|
||||
@currentToken =\
|
||||
{:type => :EndTag, :name => data, :data => []}
|
||||
@state = @states[:tagName]
|
||||
elsif data == ">"
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
|
||||
@state = @states[:data]
|
||||
else
|
||||
# XXX data can be _'_...
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected closing tag. Unexpected character '" + data + "' found.")})
|
||||
@stream.queue.push(data)
|
||||
@state = @states[:bogusComment]
|
||||
end
|
||||
end
|
||||
return true
|
||||
end
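# A hedged illustration (editorial) of the RCDATA/CDATA check above: while the
# content model flag is :RCDATA (e.g. inside <textarea>), "</div>" does not
# match the last emitted start tag name, so it is reported as a parse error
# and re-emitted as character data; "</textarea>" does match, the flag
# switches back to :PCDATA, and a real EndTag token is produced for it.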
|
||||
|
||||
def tagNameState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
@state = @states[:beforeAttributeName]
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in the tag name.")})
|
||||
emitCurrentToken
|
||||
elsif ASCII_LETTERS.include? data
|
||||
@currentToken[:name] += data +\
|
||||
@stream.charsUntil(ASCII_LETTERS, true)
|
||||
elsif data == ">"
|
||||
emitCurrentToken
|
||||
elsif data == "<"
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected < character when getting the tag name.")})
|
||||
emitCurrentToken
|
||||
elsif data == "/"
|
||||
processSolidusInTag
|
||||
@state = @states[:beforeAttributeName]
|
||||
else
|
||||
@currentToken[:name] += data
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def beforeAttributeNameState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
@stream.charsUntil(SPACE_CHARACTERS, true)
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file. Expected attribute name instead.")})
|
||||
emitCurrentToken
|
||||
elsif ASCII_LETTERS.include? data
|
||||
@currentToken[:data].push([data, ""])
|
||||
@state = @states[:attributeName]
|
||||
elsif data == ">"
|
||||
emitCurrentToken
|
||||
elsif data == "/"
|
||||
processSolidusInTag
|
||||
elsif data == "<"
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected < character. Expected attribute name instead.")})
|
||||
emitCurrentToken
|
||||
else
|
||||
@currentToken[:data].push([data, ""])
|
||||
@state = @states[:attributeName]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def attributeNameState
|
||||
data = @stream.char
|
||||
leavingThisState = true
|
||||
if data == "="
|
||||
@state = @states[:beforeAttributeValue]
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in attribute name.")})
|
||||
emitCurrentToken
|
||||
leavingThisState = false
|
||||
elsif ASCII_LETTERS.include? data
|
||||
@currentToken[:data][-1][0] += data +\
|
||||
@stream.charsUntil(ASCII_LETTERS, true)
|
||||
leavingThisState = false
|
||||
elsif data == ">"
|
||||
# XXX If we emit here the attributes are converted to a dict
|
||||
# without being checked and when the code below runs we error
|
||||
# because data is a dict not a list
|
||||
elsif SPACE_CHARACTERS.include? data
|
||||
@state = @states[:afterAttributeName]
|
||||
elsif data == "/"
|
||||
processSolidusInTag
|
||||
@state = @states[:beforeAttributeName]
|
||||
elsif data == "<"
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected < character in attribute name.")})
|
||||
emitCurrentToken
|
||||
leavingThisState = false
|
||||
else
|
||||
@currentToken[:data][-1][0] += data
|
||||
leavingThisState = false
|
||||
end
|
||||
|
||||
if leavingThisState
|
||||
# Attributes are not dropped at this stage. That happens when the
|
||||
# start tag token is emitted so values can still be safely appended
|
||||
# to attributes, but we do want to report the parse error in time.
|
||||
@currentToken[:data][0...-1].each {|name,value|
|
||||
if @currentToken[:data][-1][0] == name
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Dropped duplicate attribute on tag.")})
|
||||
end
|
||||
}
|
||||
# XXX Fix for above XXX
|
||||
if data == ">"
|
||||
emitCurrentToken
|
||||
end
|
||||
end
|
||||
return true
|
||||
end
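# A hedged illustration (editorial) of the duplicate-attribute check above:
# once an attribute name is complete it is compared against the earlier names
# on the same tag; a match only queues a ParseError here, and the duplicate
# value is actually dropped later, when the start tag token is emitted and
# its attribute list is converted to a hash.
#
#   "<p id=a id=b>" => ParseError "Dropped duplicate attribute on tag."
#                      followed by StartTag "p"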
|
||||
|
||||
def afterAttributeNameState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
@stream.charsUntil(SPACE_CHARACTERS, true)
|
||||
elsif data == "="
|
||||
@state = @states[:beforeAttributeValue]
|
||||
elsif data == ">"
|
||||
emitCurrentToken
|
||||
elsif ASCII_LETTERS.include? data
|
||||
@currentToken[:data].push([data, ""])
|
||||
@state = @states[:attributeName]
|
||||
elsif data == "/"
|
||||
processSolidusInTag
|
||||
@state = @states[:beforeAttributeName]
|
||||
elsif data == "<"
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected < character. Expected = or end of tag.")})
|
||||
emitCurrentToken
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file. Expected = or end of tag.")})
|
||||
emitCurrentToken
|
||||
else
|
||||
@currentToken[:data].push([data, ""])
|
||||
@state = @states[:attributeName]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def beforeAttributeValueState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
@stream.charsUntil(SPACE_CHARACTERS, true)
|
||||
elsif data == "\""
|
||||
@state = @states[:attributeValueDoubleQuoted]
|
||||
elsif data == "&"
|
||||
@state = @states[:attributeValueUnQuoted]
|
||||
@stream.queue.push(data);
|
||||
elsif data == "'"
|
||||
@state = @states[:attributeValueSingleQuoted]
|
||||
elsif data == ">"
|
||||
emitCurrentToken
|
||||
elsif data == "<"
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected < character. Expected attribute value.")})
|
||||
emitCurrentToken
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file. Expected attribute value.")})
|
||||
emitCurrentToken
|
||||
else
|
||||
@currentToken[:data][-1][1] += data
|
||||
@state = @states[:attributeValueUnQuoted]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def attributeValueDoubleQuotedState
|
||||
data = @stream.char
|
||||
if data == "\""
|
||||
@state = @states[:beforeAttributeName]
|
||||
elsif data == "&"
|
||||
processEntityInAttribute
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in attribute value (\").")})
|
||||
emitCurrentToken
|
||||
else
|
||||
@currentToken[:data][-1][1] += data +\
|
||||
@stream.charsUntil(["\"", "&"])
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def attributeValueSingleQuotedState
|
||||
data = @stream.char
|
||||
if data == "'"
|
||||
@state = @states[:beforeAttributeName]
|
||||
elsif data == "&"
|
||||
processEntityInAttribute
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in attribute value (').")})
|
||||
emitCurrentToken
|
||||
else
|
||||
@currentToken[:data][-1][1] += data +\
|
||||
@stream.charsUntil(["'", "&"])
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def attributeValueUnQuotedState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
@state = @states[:beforeAttributeName]
|
||||
elsif data == "&"
|
||||
processEntityInAttribute
|
||||
elsif data == ">"
|
||||
emitCurrentToken
|
||||
elsif data == "<"
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected < character in attribute value.")})
|
||||
emitCurrentToken
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in attribute value.")})
|
||||
emitCurrentToken
|
||||
else
|
||||
@currentToken[:data][-1][1] += data +
|
||||
@stream.charsUntil(["&", ">","<"] + SPACE_CHARACTERS)
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def bogusCommentState
|
||||
# Make a new comment token and give it as value all the characters
|
||||
# until the first > or :EOF (charsUntil checks for :EOF automatically)
|
||||
# and emit it.
|
||||
@tokenQueue.push(
|
||||
{:type => :Comment, :data => @stream.charsUntil((">"))})
|
||||
|
||||
# Eat the character directly after the bogus comment which is either a
|
||||
# ">" or an :EOF.
|
||||
@stream.char
|
||||
@state = @states[:data]
|
||||
return true
|
||||
end
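# A hedged example (editorial): processing instructions reach this state via
# tagOpenState, so everything up to the next ">" becomes a Comment token.
#
#   "<?xml version='1.0'?>" => Comment token with :data "?xml version='1.0'?"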
|
||||
|
||||
def markupDeclarationOpenState
|
||||
charStack = [@stream.char, @stream.char]
|
||||
if charStack == ["-", "-"]
|
||||
@currentToken = {:type => :Comment, :data => ""}
|
||||
@state = @states[:comment]
|
||||
else
|
||||
5.times { charStack.push(@stream.char) }
|
||||
# Put in explicit :EOF check
|
||||
if ((not charStack.include? :EOF) and
|
||||
charStack.join("").upcase == "DOCTYPE")
|
||||
@currentToken =\
|
||||
{:type => :Doctype, :name => "", :data => true}
|
||||
@state = @states[:doctype]
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected '--' or 'DOCTYPE'. Not found.")})
|
||||
@stream.queue += charStack
|
||||
@state = @states[:bogusComment]
|
||||
end
|
||||
end
|
||||
return true
|
||||
end
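# A hedged example (editorial) of the "<!" handling above, together with the
# comment and doctype states that follow it:
#
#   "<!-- hi -->"     => Comment token {:data => " hi "}
#   "<!DOCTYPE html>" => Doctype token {:name => "HTML", :data => false}
#
# Anything else after "<!" is reported and handled as a bogus comment.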
|
||||
|
||||
def commentState
|
||||
data = @stream.char
|
||||
if data == "-"
|
||||
@state = @states[:commentDash]
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in comment.")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
else
|
||||
@currentToken[:data] += data + @stream.charsUntil("-")
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def commentDashState
|
||||
data = @stream.char
|
||||
if data == "-"
|
||||
@state = @states[:commentEnd]
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in comment (-)")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
else
|
||||
@currentToken[:data] += "-" + data +\
|
||||
@stream.charsUntil("-")
|
||||
# Consume the next character which is either a "-" or an :EOF as
|
||||
# well so if there's a "-" directly after the "-" we go nicely to
|
||||
# the "comment end state" without emitting a ParseError there.
|
||||
@stream.char
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def commentEndState
|
||||
data = @stream.char
|
||||
if data == ">"
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
elsif data == "-"
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected '-' after '--' found in comment.")})
|
||||
@currentToken[:data] += data
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in comment (--).")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
else
|
||||
# XXX
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected character in comment found.")})
|
||||
@currentToken[:data] += "--" + data
|
||||
@state = @states[:comment]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def doctypeState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
@state = @states[:beforeDoctypeName]
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("No space after literal string 'DOCTYPE'.")})
|
||||
@stream.queue.push(data)
|
||||
@state = @states[:beforeDoctypeName]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def beforeDoctypeNameState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
elsif ASCII_LOWERCASE.include? data
|
||||
@currentToken[:name] = data.upcase
|
||||
@state = @states[:doctypeName]
|
||||
elsif data == ">"
|
||||
# Character needs to be consumed per the specification so don't
|
||||
# invoke emitCurrentTokenWithParseError with :data as argument.
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected > character. Expected DOCTYPE name.")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file. Expected DOCTYPE name.")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
else
|
||||
@currentToken[:name] = data
|
||||
@state = @states[:doctypeName]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def doctypeNameState
|
||||
data = @stream.char
|
||||
needsDoctypeCheck = false
|
||||
if SPACE_CHARACTERS.include? data
|
||||
@state = @states[:afterDoctypeName]
|
||||
needsDoctypeCheck = true
|
||||
elsif data == ">"
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
elsif data == :EOF
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in DOCTYPE name.")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
else
|
||||
# We can't just uppercase everything that arrives here. For
|
||||
# instance, non-ASCII characters.
|
||||
if ASCII_LOWERCASE.include? data
|
||||
data = data.upcase
|
||||
end
|
||||
@currentToken[:name] += data
|
||||
needsDoctypeCheck = true
|
||||
end
|
||||
|
||||
# After some iterations through this state it should eventually say
|
||||
# "HTML". Otherwise there's an error.
|
||||
if needsDoctypeCheck and @currentToken[:name] == "HTML"
|
||||
@currentToken[:data] = false
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def afterDoctypeNameState
|
||||
data = @stream.char
|
||||
if SPACE_CHARACTERS.include? data
|
||||
elsif data == ">"
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
elsif data == :EOF
|
||||
@currentToken[:data] = true
|
||||
# XXX EMIT
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
else
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Expected space or '>'. Got '" + data + "'")})
|
||||
@currentToken[:data] = true
|
||||
@state = @states[:bogusDoctype]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def bogusDoctypeState
|
||||
data = @stream.char
|
||||
if data == ">"
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
elsif data == :EOF
|
||||
# XXX EMIT
|
||||
@stream.queue.push(data)
|
||||
@tokenQueue.push({:type => :ParseError, :data =>
|
||||
_("Unexpected end of file in bogus doctype.")})
|
||||
@tokenQueue.push(@currentToken)
|
||||
@state = @states[:data]
|
||||
end
|
||||
return true
|
||||
end
|
||||
|
||||
def _(string); string; end
|
||||
end
|
||||
|
||||
end
|
21
vendor/plugins/HTML5lib/lib/html5lib/treebuilders.rb
vendored
Normal file
@ -0,0 +1,21 @@
module HTML5lib
  module TreeBuilders

    def self.getTreeBuilder(name)
      case name.to_s.downcase
      when 'simpletree' then
        require 'html5lib/treebuilders/simpletree'
        SimpleTree::TreeBuilder
      when 'rexml' then
        require 'html5lib/treebuilders/rexml'
        REXMLTree::TreeBuilder
      when 'hpricot' then
        require 'html5lib/treebuilders/hpricot'
        Hpricot::TreeBuilder
      else
        raise "Unknown TreeBuilder #{name}"
      end
    end

  end
end
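# A hedged usage sketch (editorial, not part of this file): choosing a tree
# builder by name and handing it to the parser. Only getTreeBuilder itself is
# defined here; the :tree option name used below is an assumption about the
# parser's interface.
#
#   require 'html5lib'
#   require 'html5lib/treebuilders'
#
#   builder = HTML5lib::TreeBuilders.getTreeBuilder('rexml')
#   doc     = HTML5lib.parse('<p>Hello', :tree => builder)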
330
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/base.rb
vendored
Executable file
@ -0,0 +1,330 @@
require 'html5lib/constants'
|
||||
|
||||
#XXX - TODO; make the default interface more ElementTree-like rather than DOM-like
|
||||
|
||||
module HTML5lib
|
||||
|
||||
# The scope markers are inserted when entering buttons, object elements,
|
||||
# marquees, table cells, and table captions, and are used to prevent formatting
|
||||
# from "leaking" into tables, buttons, object elements, and marquees.
|
||||
Marker = nil
|
||||
|
||||
module TreeBuilders
|
||||
module Base
|
||||
|
||||
class Node
|
||||
# The parent of the current node (or nil for the document node)
|
||||
attr_accessor :parent
|
||||
|
||||
# a list of child nodes of the current node. This must
|
||||
# include all elements but not necessarily other node types
|
||||
attr_accessor :childNodes
|
||||
|
||||
# A list of miscellaneous flags that can be set on the node
|
||||
attr_accessor :_flags
|
||||
|
||||
def initialize(name)
|
||||
@parent = nil
|
||||
@childNodes = []
|
||||
@_flags = []
|
||||
end
|
||||
|
||||
# Insert node as a child of the current node
|
||||
def appendChild(node)
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
||||
# Insert data as text in the current node, positioned before the
|
||||
# start of node insertBefore or to the end of the node's text.
|
||||
def insertText(data, insertBefore = nil)
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
||||
# Insert node as a child of the current node, before refNode in the
|
||||
# list of child nodes. Raises ValueError if refNode is not a child of
|
||||
# the current node
|
||||
def insertBefore(node, refNode)
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
||||
# Remove node from the children of the current node
|
||||
def removeChild(node)
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
||||
# Move all the children of the current node to newParent.
|
||||
# This is needed so that trees that don't store text as nodes move the
|
||||
# text in the correct way
|
||||
def reparentChildren(newParent)
|
||||
#XXX - should this method be made more general?
|
||||
@childNodes.each { |child| newParent.appendChild(child) }
|
||||
@childNodes = []
|
||||
end
|
||||
|
||||
# Return a shallow copy of the current node i.e. a node with the same
|
||||
# name and attributes but with no parent or child nodes
|
||||
def cloneNode
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
||||
# Return true if the node has children or text, false otherwise
|
||||
def hasContent
|
||||
raise NotImplementedError
|
||||
end
|
||||
end
|
||||
|
||||
# Base treebuilder implementation
|
||||
class TreeBuilder
|
||||
|
||||
attr_accessor :openElements
|
||||
|
||||
attr_accessor :activeFormattingElements
|
||||
|
||||
attr_accessor :document
|
||||
|
||||
attr_accessor :headPointer
|
||||
|
||||
attr_accessor :formPointer
|
||||
|
||||
# Class to use for document root
|
||||
documentClass = nil
|
||||
|
||||
# Class to use for HTML elements
|
||||
elementClass = nil
|
||||
|
||||
# Class to use for comments
|
||||
commentClass = nil
|
||||
|
||||
# Class to use for doctypes
|
||||
doctypeClass = nil
|
||||
|
||||
# Fragment class
|
||||
fragmentClass = nil
|
||||
|
||||
def initialize
|
||||
reset
|
||||
end
|
||||
|
||||
def reset
|
||||
@openElements = []
|
||||
@activeFormattingElements = []
|
||||
|
||||
#XXX - rename these to headElement, formElement
|
||||
@headPointer = nil
|
||||
@formPointer = nil
|
||||
|
||||
self.insertFromTable = false
|
||||
|
||||
@document = @documentClass.new
|
||||
end
|
||||
|
||||
def elementInScope(target, tableVariant = false)
|
||||
# Exit early when possible.
|
||||
return true if @openElements[-1].name == target
|
||||
|
||||
# AT How about while true and simply set node to [-1] and set it to
|
||||
# [-2] at the end...
|
||||
@openElements.reverse.each do |element|
|
||||
if element.name == target
|
||||
return true
|
||||
elsif element.name == 'table'
|
||||
return false
|
||||
elsif not tableVariant and SCOPING_ELEMENTS.include?(element.name)
|
||||
return false
|
||||
elsif element.name == 'html'
|
||||
return false
|
||||
end
|
||||
end
|
||||
assert false # We should never reach this point
|
||||
end
|
||||
|
||||
def reconstructActiveFormattingElements
|
||||
# Within this algorithm the order of steps described in the
|
||||
# specification is not quite the same as the order of steps in the
|
||||
# code. It should still do the same though.
|
||||
|
||||
# Step 1: stop the algorithm when there's nothing to do.
|
||||
return unless @activeFormattingElements
|
||||
|
||||
# Step 2 and step 3: we start with the last element. So i is -1.
|
||||
i = -1
|
||||
entry = @activeFormattingElements[i]
|
||||
return if entry == Marker or @openElements.include?(entry)
|
||||
|
||||
# Step 6
|
||||
until entry == Marker or @openElements.include?(entry)
|
||||
# Step 5: let entry be one earlier in the list.
|
||||
i -= 1
|
||||
begin
|
||||
entry = @activeFormattingElements[i]
|
||||
rescue
|
||||
# Step 4: at this point we need to jump to step 8. By not doing
|
||||
# i += 1 which is also done in step 7 we achieve that.
|
||||
break
|
||||
end
|
||||
end
|
||||
while true
|
||||
# Step 7
|
||||
i += 1
|
||||
|
||||
# Step 8
|
||||
clone = @activeFormattingElements[i].cloneNode
|
||||
|
||||
# Step 9
|
||||
element = insertElement(clone.name, clone.attributes)
|
||||
|
||||
# Step 10
|
||||
@activeFormattingElements[i] = element
|
||||
|
||||
# Step 11
|
||||
break if element == @activeFormattingElements[-1]
|
||||
end
|
||||
end
|
||||
|
||||
def clearActiveFormattingElements
|
||||
{} until @activeFormattingElements.empty? || @activeFormattingElements.pop == Marker
|
||||
end
|
||||
|
||||
# Check if an element exists between the end of the active
|
||||
# formatting elements and the last marker. If it does, return it, else
|
||||
# return false
|
||||
def elementInActiveFormattingElements(name)
|
||||
@activeFormattingElements.reverse.each do |element|
|
||||
# Check for Marker first because if it's a Marker it doesn't have a
|
||||
# name attribute.
|
||||
break if element == Marker
|
||||
return element if element.name == name
|
||||
end
|
||||
return false
|
||||
end
|
||||
|
||||
def insertDoctype(name)
|
||||
@document.appendChild(@doctypeClass.new(name))
|
||||
end
|
||||
|
||||
def insertComment(data, parent = nil)
|
||||
parent = @openElements[-1] if parent.nil?
|
||||
parent.appendChild(@commentClass.new(data))
|
||||
end
|
||||
|
||||
# Create an element but don't insert it anywhere
|
||||
def createElement(name, attributes)
|
||||
element = @elementClass.new(name)
|
||||
element.attributes = attributes
|
||||
return element
|
||||
end
|
||||
|
||||
# Switch the function used to insert an element from the
|
||||
# normal one to the misnested table one and back again
|
||||
def insertFromTable=(value)
|
||||
@insertFromTable = value
|
||||
@insertElement = value ? :insertElementTable : :insertElementNormal
|
||||
end
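# A hedged note (editorial) on the setter above: it records the flag and swaps
# the symbol that insertElement dispatches on via send.
#
#   self.insertFromTable = true
#   insertElement('td', {})   # routed to insertElementTable
#   self.insertFromTable = false
#   insertElement('td', {})   # routed to insertElementNormal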
|
||||
|
||||
def insertElement(name, attributes)
|
||||
send(@insertElement, name, attributes)
|
||||
end
|
||||
|
||||
def insertElementNormal(name, attributes)
|
||||
element = @elementClass.new(name)
|
||||
element.attributes = attributes
|
||||
@openElements[-1].appendChild(element)
|
||||
@openElements.push(element)
|
||||
return element
|
||||
end
|
||||
|
||||
# Create an element and insert it into the tree
|
||||
def insertElementTable(name, attributes)
|
||||
element = @elementClass.new(name)
|
||||
element.attributes = attributes
|
||||
if TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)
|
||||
#We should be in the InTable mode. This means we want to do
|
||||
#special magic element rearranging
|
||||
parent, insertBefore = getTableMisnestedNodePosition
|
||||
if insertBefore.nil?
|
||||
parent.appendChild(element)
|
||||
else
|
||||
parent.insertBefore(element, insertBefore)
|
||||
end
|
||||
@openElements.push(element)
|
||||
else
|
||||
return insertElementNormal(name, attributes)
|
||||
end
|
||||
return element
|
||||
end
|
||||
|
||||
def insertText(data, parent = nil)
|
||||
parent = @openElements[-1] if parent.nil?
|
||||
|
||||
if (not(@insertFromTable) or (@insertFromTable and not TABLE_INSERT_MODE_ELEMENTS.include?(@openElements[-1].name)))
|
||||
parent.insertText(data)
|
||||
else
|
||||
#We should be in the InTable mode. This means we want to do
|
||||
#special magic element rearranging
|
||||
parent, insertBefore = getTableMisnestedNodePosition
|
||||
parent.insertText(data, insertBefore)
|
||||
end
|
||||
end
|
||||
|
||||
# Get the foster parent element, and sibling to insert before
|
||||
# (or nil) when inserting a misnested table node
|
||||
def getTableMisnestedNodePosition
|
||||
#The foster parent element is the one which comes before the most
|
||||
#recently opened table element
|
||||
#XXX - this is really inelegant
|
||||
lastTable = nil
|
||||
fosterParent = nil
|
||||
insertBefore = nil
|
||||
@openElements.reverse.each do |element|
|
||||
if element.name == "table"
|
||||
lastTable = element
|
||||
break
|
||||
end
|
||||
end
|
||||
if lastTable
|
||||
#XXX - we should really check that this parent is actually a
|
||||
#node here
|
||||
if lastTable.parent
|
||||
fosterParent = lastTable.parent
|
||||
insertBefore = lastTable
|
||||
else
|
||||
fosterParent = @openElements[@openElements.index(lastTable) - 1]
|
||||
end
|
||||
else
|
||||
fosterParent = @openElements[0]
|
||||
end
|
||||
return fosterParent, insertBefore
|
||||
end
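# A hedged illustration (editorial) of the foster-parenting rule above, in
# terms of the open-elements stack:
#
#   [html, body, table]              => parent = body, insert before the table
#   [html, table] (table unparented) => parent = html, appended at the end
#   [html] (no table at all)         => parent = html, appended at the end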
|
||||
|
||||
def generateImpliedEndTags(exclude = nil)
|
||||
name = @openElements[-1].name
|
||||
|
||||
if (['dd', 'dt', 'li', 'p', 'td', 'th', 'tr'].include?(name) and name != exclude)
|
||||
@openElements.pop
|
||||
# XXX This is not entirely what the specification says. We should
|
||||
# investigate it more closely.
|
||||
generateImpliedEndTags(exclude)
|
||||
end
|
||||
end
|
||||
|
||||
def getDocument
|
||||
@document
|
||||
end
|
||||
|
||||
def getFragment
|
||||
#assert @innerHTML
|
||||
fragment = @fragmentClass.new
|
||||
@openElements[0].reparentChildren(fragment)
|
||||
return fragment
|
||||
end
|
||||
|
||||
# Serialize the subtree of node in the format required by unit tests
|
||||
# node - the node from which to start serializing
|
||||
def testSerializer(node)
|
||||
raise NotImplementedError
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
211
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/hpricot.rb
vendored
Normal file
@ -0,0 +1,211 @@
require 'html5lib/treebuilders/base'
|
||||
require 'hpricot'
|
||||
require 'forwardable'
|
||||
|
||||
module HTML5lib
|
||||
module TreeBuilders
|
||||
module Hpricot
|
||||
|
||||
class Node < Base::Node
|
||||
|
||||
extend Forwardable
|
||||
|
||||
def_delegators :@hpricot, :name
|
||||
|
||||
attr_accessor :hpricot
|
||||
|
||||
def initialize(name)
|
||||
super(name)
|
||||
@hpricot = self.class.hpricot_class.new name
|
||||
end
|
||||
|
||||
def appendChild(node)
|
||||
if node.kind_of?(TextNode) and childNodes.any? and childNodes.last.kind_of?(TextNode)
|
||||
childNodes[-1].hpricot.content = childNodes[-1].hpricot.to_s + node.hpricot.to_s
|
||||
else
|
||||
childNodes << node
|
||||
hpricot.children << node.hpricot
|
||||
end
|
||||
node.parent = self
|
||||
end
|
||||
|
||||
def removeChild(node)
|
||||
childNodes.delete(node)
|
||||
hpricot.children.delete_at(hpricot.children.index(node.hpricot))
|
||||
node.parent = nil
|
||||
end
|
||||
|
||||
def insertText(data, before = nil)
|
||||
if before
|
||||
insertBefore(TextNode.new(data), before)
|
||||
else
|
||||
appendChild(TextNode.new(data))
|
||||
end
|
||||
end
|
||||
|
||||
def insertBefore(node, refNode)
|
||||
index = childNodes.index(refNode)
|
||||
if node.kind_of?(TextNode) and index > 0 and childNodes[index-1].kind_of?(TextNode)
|
||||
childNodes[index-1].hpricot.content = childNodes[index-1].hpricot.to_s + node.hpricot.to_s
|
||||
else
|
||||
childNodes.insert(index, node)
|
||||
end
|
||||
end
|
||||
|
||||
def hasContent
|
||||
childNodes.any?
|
||||
end
|
||||
end
|
||||
|
||||
class Element < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Elem
|
||||
end
|
||||
|
||||
def initialize(name)
|
||||
super(name)
|
||||
|
||||
@hpricot = ::Hpricot::Elem.new(::Hpricot::STag.new(name))
|
||||
end
|
||||
|
||||
def name
|
||||
@hpricot.stag.name
|
||||
end
|
||||
|
||||
def cloneNode
|
||||
attributes.inject(self.class.new(name)) do |node, (name, value)|
|
||||
node.hpricot[name] = value
|
||||
node
|
||||
end
|
||||
end
|
||||
|
||||
# A call to Hpricot::Elem#raw_attributes is built dynamically,
|
||||
# so alterations to the returned value (a hash) will be lost.
|
||||
#
|
||||
# AttributeProxy works around this by forwarding :[]= calls
|
||||
# to the raw_attributes accessor on the element start tag.
|
||||
#
|
||||
class AttributeProxy
|
||||
def initialize(hpricot)
|
||||
@hpricot = hpricot
|
||||
end
|
||||
def []=(k, v)
|
||||
@hpricot.stag.send(stag_attributes_method)[k] = v
|
||||
end
|
||||
def stag_attributes_method
|
||||
# STag#attributes changed to STag#raw_attributes after Hpricot 0.5
|
||||
@hpricot.stag.respond_to?(:raw_attributes) ? :raw_attributes : :attributes
|
||||
end
|
||||
def method_missing(*a, &b)
|
||||
@hpricot.attributes.send(*a, &b)
|
||||
end
|
||||
end
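# A hedged usage sketch (editorial) of AttributeProxy: writes are forwarded to
# the start tag's attribute hash so they persist, while reads and everything
# else fall through to Hpricot's regular attributes via method_missing.
#
#   element = Element.new('span')
#   element.attributes['class'] = 'note'   # stored on the element's start tag
#   element.attributes['class']            # read back through @hpricot.attributes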
|
||||
|
||||
def attributes
|
||||
AttributeProxy.new(@hpricot)
|
||||
end
|
||||
|
||||
def attributes=(attrs)
|
||||
attrs.each { |name, value| @hpricot[name] = value }
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
tree = "\n|#{' ' * indent}<#{name}>"
|
||||
indent += 2
|
||||
attributes.each do |name, value|
|
||||
next if name == 'xmlns'
|
||||
tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
|
||||
end
|
||||
childNodes.inject(tree) { |tree, child| tree + child.printTree(indent) }
|
||||
end
|
||||
end
|
||||
|
||||
class Document < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Doc
|
||||
end
|
||||
|
||||
def initialize
|
||||
super(nil)
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
childNodes.inject('#document') { |tree, child| tree + child.printTree(indent + 2) }
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentType < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::DocType
|
||||
end
|
||||
|
||||
def initialize(name)
|
||||
begin
|
||||
super(name)
|
||||
rescue ArgumentError # needs 3...
|
||||
end
|
||||
|
||||
@hpricot = ::Hpricot::DocType.new(name, nil, nil)
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
"\n|#{' ' * indent}<!DOCTYPE #{hpricot.target}>"
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentFragment < Element
|
||||
def initialize
|
||||
super('')
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
childNodes.inject('') { |tree, child| tree + child.printTree(indent+2) }
|
||||
end
|
||||
end
|
||||
|
||||
class TextNode < Node
|
||||
def initialize(data)
|
||||
@hpricot = ::Hpricot::Text.new(data)
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
"\n|#{' ' * indent}\"#{hpricot.content}\""
|
||||
end
|
||||
end
|
||||
|
||||
class CommentNode < Node
|
||||
def self.hpricot_class
|
||||
::Hpricot::Comment
|
||||
end
|
||||
|
||||
def printTree(indent = 0)
|
||||
"\n|#{' ' * indent}<!-- #{hpricot.content} -->"
|
||||
end
|
||||
end
|
||||
|
||||
class TreeBuilder < Base::TreeBuilder
|
||||
def initialize
|
||||
@documentClass = Document
|
||||
@doctypeClass = DocumentType
|
||||
@elementClass = Element
|
||||
@commentClass = CommentNode
|
||||
@fragmentClass = DocumentFragment
|
||||
end
|
||||
|
||||
def testSerializer(node)
|
||||
node.printTree
|
||||
end
|
||||
|
||||
def getDocument
|
||||
@document.hpricot
|
||||
end
|
||||
|
||||
def getFragment
|
||||
@document = super
|
||||
return @document.hpricot.children
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
191
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/rexml.rb
vendored
Normal file
@ -0,0 +1,191 @@
require 'html5lib/treebuilders/base'
|
||||
require 'rexml/document'
|
||||
require 'forwardable'
|
||||
|
||||
module HTML5lib
|
||||
module TreeBuilders
|
||||
module REXMLTree
|
||||
|
||||
class Node < Base::Node
|
||||
extend Forwardable
|
||||
def_delegators :@rxobj, :name, :attributes
|
||||
attr_accessor :rxobj
|
||||
|
||||
def initialize name
|
||||
super name
|
||||
@rxobj = self.class.rxclass.new name
|
||||
end
|
||||
|
||||
def appendChild node
|
||||
if node.kind_of? TextNode and
|
||||
childNodes.length>0 and childNodes[-1].kind_of? TextNode
|
||||
childNodes[-1].rxobj.value =
|
||||
childNodes[-1].rxobj.to_s + node.rxobj.to_s
|
||||
childNodes[-1].rxobj.raw = true
|
||||
else
|
||||
childNodes.push node
|
||||
rxobj.add node.rxobj
|
||||
end
|
||||
node.parent = self
|
||||
end
|
||||
|
||||
def removeChild node
|
||||
childNodes.delete node
|
||||
rxobj.delete node.rxobj
|
||||
node.parent = nil
|
||||
end
|
||||
|
||||
def insertText data, before=nil
|
||||
if before
|
||||
insertBefore TextNode.new(data), before
|
||||
else
|
||||
appendChild TextNode.new(data)
|
||||
end
|
||||
end
|
||||
|
||||
def insertBefore node, refNode
|
||||
index = childNodes.index(refNode)
|
||||
if node.kind_of? TextNode and index>0 and
|
||||
childNodes[index-1].kind_of? TextNode
|
||||
childNodes[index-1].rxobj.value =
|
||||
childNodes[index-1].rxobj.to_s + node.rxobj.to_s
|
||||
childNodes[index-1].rxobj.raw = true
|
||||
else
|
||||
childNodes.insert index, node
|
||||
end
|
||||
end
|
||||
|
||||
def hasContent
|
||||
return (childNodes.length > 0)
|
||||
end
|
||||
end
|
||||
|
||||
class Element < Node
|
||||
def self.rxclass
|
||||
REXML::Element
|
||||
end
|
||||
|
||||
def initialize name
|
||||
super name
|
||||
end
|
||||
|
||||
def cloneNode
|
||||
newNode = self.class.new name
|
||||
attributes.each {|name,value| newNode.attributes[name] = value}
|
||||
newNode
|
||||
end
|
||||
|
||||
def attributes= value
|
||||
value.each {|name,value| rxobj.attributes[name]=value}
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
tree = "\n|#{' ' * indent}<#{name}>"
|
||||
indent += 2
|
||||
for name, value in attributes
|
||||
next if name == 'xmlns'
|
||||
tree += "\n|#{' ' * indent}#{name}=\"#{value}\""
|
||||
end
|
||||
for child in childNodes
|
||||
tree += child.printTree(indent)
|
||||
end
|
||||
return tree
|
||||
end
|
||||
end
|
||||
|
||||
class Document < Node
|
||||
def self.rxclass
|
||||
REXML::Document
|
||||
end
|
||||
|
||||
def initialize
|
||||
super nil
|
||||
end
|
||||
|
||||
def appendChild node
|
||||
if node.kind_of? Element and node.name == 'html'
|
||||
node.rxobj.add_namespace('http://www.w3.org/1999/xhtml')
|
||||
end
|
||||
super node
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
tree = "#document"
|
||||
for child in childNodes
|
||||
tree += child.printTree(indent + 2)
|
||||
end
|
||||
return tree
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentType < Node
|
||||
def self.rxclass
|
||||
REXML::DocType
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
"\n|#{' ' * indent}<!DOCTYPE #{name}>"
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentFragment < Element
|
||||
def initialize
|
||||
super nil
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
tree = ""
|
||||
for child in childNodes
|
||||
tree += child.printTree(indent+2)
|
||||
end
|
||||
return tree
|
||||
end
|
||||
end
|
||||
|
||||
class TextNode < Node
|
||||
def initialize data
|
||||
raw = data.gsub('&', '&amp;').gsub('<', '&lt;').gsub('>', '&gt;')
|
||||
@rxobj = REXML::Text.new(raw, true, nil, true)
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
"\n|#{' ' * indent}\"#{rxobj.value}\""
|
||||
end
|
||||
end
|
||||
|
||||
class CommentNode < Node
|
||||
def self.rxclass
|
||||
REXML::Comment
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
"\n|#{' ' * indent}<!-- #{rxobj.string} -->"
|
||||
end
|
||||
end
|
||||
|
||||
class TreeBuilder < Base::TreeBuilder
|
||||
def initialize
|
||||
@documentClass = Document
|
||||
@doctypeClass = DocumentType
|
||||
@elementClass = Element
|
||||
@commentClass = CommentNode
|
||||
@fragmentClass = DocumentFragment
|
||||
end
|
||||
|
||||
def testSerializer node
|
||||
node.printTree()
|
||||
end
|
||||
|
||||
def getDocument
|
||||
@document.rxobj
|
||||
end
|
||||
|
||||
def getFragment
|
||||
@document = super
|
||||
return @document.rxobj.children
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
178
vendor/plugins/HTML5lib/lib/html5lib/treebuilders/simpletree.rb
vendored
Normal file
@ -0,0 +1,178 @@
require 'html5lib/treebuilders/base'
|
||||
|
||||
module HTML5lib
|
||||
module TreeBuilders
|
||||
module SimpleTree
|
||||
|
||||
class Node < Base::Node
|
||||
# Node representing an item in the tree.
|
||||
# name - The tag name associated with the node
|
||||
attr_accessor :name
|
||||
|
||||
# The value of the current node (applies to text nodes and
|
||||
# comments)
|
||||
attr_accessor :value
|
||||
|
||||
# a dict holding name, value pairs for attributes of the node
|
||||
attr_accessor :attributes
|
||||
|
||||
def initialize name
|
||||
super
|
||||
@name = name
|
||||
@value = nil
|
||||
@attributes = {}
|
||||
end
|
||||
|
||||
def appendChild node
|
||||
if node.kind_of? TextNode and
|
||||
childNodes.length>0 and childNodes[-1].kind_of? TextNode
|
||||
childNodes[-1].value += node.value
|
||||
else
|
||||
childNodes.push node
|
||||
end
|
||||
node.parent = self
|
||||
end
|
||||
|
||||
def removeChild node
|
||||
childNodes.delete node
|
||||
node.parent = nil
|
||||
end
|
||||
|
||||
def cloneNode
|
||||
newNode = self.class.new name
|
||||
attributes.each {|name,value| newNode.attributes[name] = value}
|
||||
newNode.value = value
|
||||
newNode
|
||||
end
|
||||
|
||||
def insertText data, before=nil
|
||||
if before
|
||||
insertBefore TextNode.new(data), before
|
||||
else
|
||||
appendChild TextNode.new(data)
|
||||
end
|
||||
end
|
||||
|
||||
def insertBefore node, refNode
|
||||
index = childNodes.index(refNode)
|
||||
if node.kind_of? TextNode and index>0 and
|
||||
childNodes[index-1].kind_of? TextNode
|
||||
childNodes[index-1].value += node.value
|
||||
else
|
||||
childNodes.insert index, node
|
||||
end
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
tree = "\n|%s%s" % [' '* indent, self.to_s]
|
||||
for child in childNodes
|
||||
tree += child.printTree(indent + 2)
|
||||
end
|
||||
return tree
|
||||
end
|
||||
|
||||
def hasContent
|
||||
return (childNodes.length > 0)
|
||||
end
|
||||
end
|
||||
|
||||
class Element < Node
|
||||
def to_s
|
||||
"<%s>" % name
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
tree = "\n|%s%s" % [' '* indent, self.to_s]
|
||||
indent += 2
|
||||
for name, value in attributes
|
||||
tree += "\n|%s%s=\"%s\"" % [' ' * indent, name, value]
|
||||
end
|
||||
for child in childNodes
|
||||
tree += child.printTree(indent)
|
||||
end
|
||||
return tree
|
||||
end
|
||||
end
|
||||
|
||||
class Document < Node
|
||||
def to_s
|
||||
"#document"
|
||||
end
|
||||
|
||||
def initialize
|
||||
super nil
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
tree = to_s
|
||||
for child in childNodes
|
||||
tree += child.printTree(indent + 2)
|
||||
end
|
||||
return tree
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentType < Node
|
||||
def to_s
|
||||
"<!DOCTYPE %s>" % name
|
||||
end
|
||||
end
|
||||
|
||||
class DocumentFragment < Element
|
||||
def initialize
|
||||
super nil
|
||||
end
|
||||
|
||||
def printTree indent=0
|
||||
tree = ""
|
||||
for child in childNodes
|
||||
tree += child.printTree(indent+2)
|
||||
end
|
||||
return tree
|
||||
end
|
||||
end
|
||||
|
||||
class TextNode < Node
|
||||
def initialize value
|
||||
super nil
|
||||
@value = value
|
||||
end
|
||||
|
||||
def to_s
|
||||
'"%s"' % value
|
||||
end
|
||||
end
|
||||
|
||||
class CommentNode < Node
|
||||
def initialize value
|
||||
super nil
|
||||
@value = value
|
||||
end
|
||||
|
||||
def to_s
|
||||
"<!-- %s -->" % value
|
||||
end
|
||||
end
|
||||
|
||||
class TreeBuilder < Base::TreeBuilder
|
||||
def initialize
|
||||
@documentClass = Document
|
||||
@doctypeClass = DocumentType
|
||||
@elementClass = Element
|
||||
@commentClass = CommentNode
|
||||
@fragmentClass = DocumentFragment
|
||||
end
|
||||
|
||||
def testSerializer node
|
||||
node.printTree()
|
||||
end
|
||||
|
||||
def getFragment
|
||||
@document = super
|
||||
return @document.childNodes
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
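# A hedged sketch (editorial) of the test-serializer output produced by the
# printTree/to_s methods above: one "|"-prefixed line per node, indented two
# extra spaces per level, with attributes and text values quoted.
#
#   #document
#   |  <html>
#   |    <body>
#   |      <p>
#   |        id="x"
#   |        "Hello"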
|