(.*)<\/div>\Z/m, '\1') rescue REXML::ParseException sanitized = sanitized.escapeHTML end # Sanitize a string, parsed using HTML parsing rules. # # :call-seq: # sanitize_html( string ) -> string # sanitize_html( string, {:encoding => 'iso-8859-1', :to_tree => true} ) -> REXML::Document # # Unless otherwise specified, the string is assumed to be utf-8 encoded. # By default, the output is a string. But, optionally, you can return a REXML tree. # # The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding. # (REXML trees are always utf-8 encoded.) def sanitize_html(html, options = {}) @encoding = 'utf-8' @treebuilder = TreeBuilders::REXML::TreeBuilder @to_tree = false options.each do |name, value| next unless %w(encoding treebuilder to_tree).include? name.to_s if name.to_s == 'treebuilder' @treebuilder = HTML5lib::TreeBuilders.get_tree_builder(value) else instance_variable_set("@#{name}", value) end end if @encoding == 'utf-8' parsed = HTMLParser.parse_fragment(html.to_utf8, {:tokenizer => HTMLSanitizer, :encoding => @encoding, :tree => @treebuilder }) else parsed = HTMLParser.parse_fragment(html.to_ncr, {:tokenizer => HTMLSanitizer, :encoding => @encoding, :tree => @treebuilder }) end return parsed if @to_tree return parsed.to_s end # Sanitize a REXML tree. The output is a string. # # :call-seq: # sanitize_rexml(tree) -> string # def sanitize_rexml(tree) tokens = TreeWalkers.get_tree_walker('rexml2').new(tree) XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8', :space_before_trailing_solidus => true, :inject_meta_charset => false, :sanitize => true}) end end require 'rexml/element' module REXML #:nodoc: class Element # Convert XHTML+MathML Named Entities in a REXML::Element to Numeric Character References # # :call-seq: # tree.to_ncr -> REXML::Element # # REXML, typically, converts NCRs to utf-8 characters, which is what you'll see when you # access the resulting REXML document. # # Note that this method needs to traverse the entire tree, converting text nodes and attributes # for each element. This can be SLOW. It will often be faster to serialize to a string and then # use String.to_ncr instead. # def to_ncr self.each_element { |el| el.texts.each_index {|i| el.texts[i].value = el.texts[i].to_s.to_ncr } el.attributes.each { |name,val| el.attributes[name] = val.to_ncr } el.to_ncr if el.has_elements? } return self end # Convert XHTML+MathML Named Entities in a REXML::Element to UTF-8 # # :call-seq: # tree.to_utf8 -> REXML::Element # # Note that this method needs to traverse the entire tree, converting text nodes and attributes # for each element. This can be SLOW. It will often be faster to serialize to a string and then # use String.to_utf8 instead. # def to_utf8 self.each_element { |el| el.texts.each_index {|i| el.texts[i].value = el.texts[i].to_s.to_utf8 } el.attributes.each { |name,val| el.attributes[name] = val.to_utf8 } el.to_utf8 if el.has_elements? } return self end end end module HTML5 #:nodoc: all module TreeWalkers private class << self def [](name) case name.to_s.downcase when 'rexml' require 'html5/treewalkers/rexml' REXML::TreeWalker when 'rexml2' REXML2::TreeWalker else raise "Unknown TreeWalker #{name}" end end alias :get_tree_walker :[] end module REXML2 class TreeWalker < HTML5::TreeWalkers::NonRecursiveTreeWalker private def node_details(node) case node when ::REXML::Document [:DOCUMENT] when ::REXML::Element if !node.name [:DOCUMENT_FRAGMENT] else [:ELEMENT, node.name, node.attributes.map {|name,value| [name,value.to_utf8]}, node.has_elements? || node.has_text?] end when ::REXML::Text [:TEXT, node.value.to_utf8] when ::REXML::Comment [:COMMENT, node.string] when ::REXML::DocType [:DOCTYPE, node.name, node.public, node.system] when ::REXML::XMLDecl [nil] else [:UNKNOWN, node.class.inspect] end end def first_child(node) node.children.first end def next_sibling(node) node.next_sibling end def parent(node) node.parent end end end end end