2008-01-12 06:53:29 +01:00
|
|
|
require 'rexml/entity'
|
|
|
|
require 'rexml/doctype'
|
|
|
|
require 'rexml/child'
|
|
|
|
require 'rexml/doctype'
|
|
|
|
require 'rexml/parseexception'
|
|
|
|
|
|
|
|
module REXML
|
|
|
|
# Represents text nodes in an XML document
|
|
|
|
class Text < Child
|
|
|
|
# Mixes in Comparable so the ordering operators are derived from the
# <=> method this class defines (a comparison of to_s values).
include Comparable
|
|
|
|
# Patterns for the characters that are escaped when text is written, in the
# order in which the substitutions occur. The ampersand is handled first so
# that the '&' introduced by the later replacements is not escaped again; an
# ampersand that already starts something shaped like a reference
# ("&name;" or "&#nnn;") is left untouched.
SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ]
# Replacement text for each entry of SPECIALS, position for position.
SUBSTITUTES = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#13;']
# Characters which are substituted in written strings. These are the literal
# characters produced when the references matched by SETUTITSBUS are read
# back, position for position. '&' is last so that "&amp;lt;" becomes
# "&lt;" and not "<".
SLAICEPS = [ '<', '>', '"', "'", '&' ]
SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ]
|
|
|
|
|
|
|
|
# If +raw+ is true, then REXML leaves the value alone
|
|
|
|
# Reader and writer for the flag. The constructor sets it from its +raw+
# argument, from the parent's raw value, or from a copied Text; value=
# resets it to false.
attr_accessor :raw
|
|
|
|
|
|
|
|
# Pattern passed to Text.check for raw text. It matches a bare '<', or an
# '&' optionally followed by a named or numeric reference. Text.check reads
# capture 1 (the whole match), capture 3 (the entity name) and capture 5
# (the numeric reference body, starting with '#').
# NOTE(review): that numbering presumes Entity::NAME contributes exactly one
# capture group of its own -- confirm against rexml/entity.
NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
|
|
|
|
# Matches a numeric character reference such as "&#65;" or "&#x41;".
# Leading zeros are skipped; capture 1 holds the decimal digits, or "x"
# followed by the hexadecimal digits.
NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
|
|
|
|
# Code points accepted in text: tab, line feed, carriage return, and three
# ranges. Entries are Integers or Ranges so the list can be splatted into a
# +when+ clause (see Text.check).
VALID_CHAR = [
  0x9, 0xA, 0xD,
  (0x20..0xD7FF),
  (0xE000..0xFFFD),
  (0x10000..0x10FFFF)
]

# Matches a string that consists solely of VALID_CHAR code points. The
# pattern is anchored with \A and \z: '^' and '$' are per-line anchors in
# Ruby, which would let a string pass as soon as any one of its lines was
# valid.
if String.method_defined? :encode
  # Encoding-aware Strings: build one character class from VALID_CHAR.
  # Integer (rather than Fixnum, which newer Rubies no longer define) also
  # covers Fixnum on older interpreters.
  VALID_XML_CHARS = Regexp.new('\A['+
    VALID_CHAR.map { |item|
      case item
      when Integer
        [item].pack('U').force_encoding('utf-8')
      when Range
        [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
      end
    }.join +
  ']*\z')
else
  # Byte-oriented Strings: spell out the UTF-8 byte sequences by hand.
  VALID_XML_CHARS = /\A(
       [\x09\x0A\x0D\x20-\x7E]            # ASCII
     | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
     |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
     | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
     |  \xEF[\x80-\xBE]{2}                #
     |  \xEF\xBF[\x80-\xBD]               # excluding U+fffe and U+ffff
     |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
     |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
     | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
     |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
   )*\z/nx;
end
|
|
|
|
|
|
|
|
# Constructor
# +arg+ if a String, the content is set to a copy of the String.  If a
# Text, the object is shallowly cloned: its stored string is copied and its
# +raw+ flag is adopted (overriding the +raw+ argument). Anything else
# raises a RuntimeError.
#
# +respect_whitespace+ (boolean, false) if true, whitespace is
# respected; otherwise runs of the same space, newline or tab character in
# a String argument are squeezed into one.
#
# +parent+ (nil) if this is a Parent object, the parent
# will be set to this.
#
# +raw+ (nil) This argument can be given three values.
# If true, then the value used to construct this object is expected to
# contain no unescaped XML markup, and REXML will not change the text. If
# this value is false, the string may contain any characters, and REXML will
# escape any and all defined entities whose values are contained in the
# text.  If this value is nil (the default), then the raw value of the
# parent will be used as the raw value for this node.  If there is no raw
# value for the parent, and no value is supplied, the default is false.
# Use this field if you have entities defined for some text, and you don't
# want REXML to escape that text in output.
#   Text.new( "<&", false, nil, false ) #-> "&lt;&amp;"
#   Text.new( "&lt;&amp;", false, nil, false ) #-> "&amp;lt;&amp;amp;"
#   Text.new( "<&", false, nil, true )  #-> Parse exception
#   Text.new( "&lt;&amp;", false, nil, true )  #-> "&lt;&amp;"
#   # Assume that the entity "s" is defined to be "sean"
#   # and that the entity    "r" is defined to be "russell"
#   Text.new( "sean russell" )          #-> "&s; &r;"
#   Text.new( "sean russell", false, nil, true ) #-> "sean russell"
# Raw content is validated with Text.check only when the node has a parent.
#
# +entity_filter+ (nil) Stored and handed to Text.normalize by to_s. This
# argument is only useful if +raw+ is set to false.
# NOTE(review): earlier documentation described this as an array of entity
# names to match, e.g. ["s"]; Text.normalize as written skips the entities
# for which entity_filter.include?(entity) holds -- confirm the intended
# semantics.
#
# +illegal+ INTERNAL USE ONLY. Accepted for compatibility; this method does
# not read it and always validates with NEEDS_A_SECOND_CHECK.
def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
  entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK )

  @raw = false

  if parent
    super( parent )
    @raw = parent.raw
  else
    @parent = nil
  end

  # An explicit raw argument wins over the value inherited from the parent.
  @raw = raw unless raw.nil?
  @entity_filter = entity_filter
  @normalized = @unnormalized = nil

  if arg.kind_of? String
    # dup, not clone: clone keeps the frozen state and the in-place edits
    # below would then fail on a frozen argument.
    @string = arg.dup
    @string.squeeze!(" \n\t") unless respect_whitespace
  elsif arg.kind_of? Text
    # Copy the stored string itself. Going through to_s would hand back the
    # escaped form for a non-raw Text (and so escape it a second time later)
    # or the very same String object for a raw one.
    @string = arg.instance_variable_get(:@string).dup
    @raw = arg.raw
  else
    raise "Illegal argument of type #{arg.class} for Text constructor (#{arg})"
  end

  # Normalize line endings to "\n".
  @string.gsub!( /\r\n?/, "\n" )

  Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent
end
|
|
|
|
|
|
|
|
# Sets the parent through the superclass writer, then validates the stored
# string with Text.check when this node is raw and now has a parent (the
# doctype used for the check is looked up through that parent).
def parent=(parent)
  super(parent)
  Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw && @parent
end
|
|
|
|
|
|
|
|
# check for illegal characters
#
# +string+ the text to validate
# +pattern+ a regexp with the capture layout of NEEDS_A_SECOND_CHECK: this
# method reads capture 1 (whole match), 3 (entity name) and 5 (numeric
# reference body starting with '#')
# +doctype+ an object whose +entities+ responds to has_key?, or nil
#
# Raises a RuntimeError for a character outside VALID_CHAR, for a match
# that does not end in ';' (a bare '<' or '&'), for a numeric reference to
# a code point outside VALID_CHAR, and for a named reference that is
# neither listed in SUBSTITUTES nor declared in +doctype+. The return value
# is not meaningful.
def self.check(string, pattern, doctype)

  # illegal anywhere
  if string !~ VALID_XML_CHARS
    if String.method_defined? :encode
      string.chars.each do |c|
        case c.ord
        when *VALID_CHAR
        else
          raise "Illegal character #{c.inspect} in raw string \"#{string}\""
        end
      end
    else
      # NOTE(review): legacy path for byte-oriented Strings. unpack('U')
      # returns an Array, which no entry of VALID_CHAR matches, so this
      # looks like it always raises -- confirm before relying on it.
      string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/n) do |c|
        case c.unpack('U')
        when *VALID_CHAR
        else
          raise "Illegal character #{c.inspect} in raw string \"#{string}\""
        end
      end
    end
  end

  # context sensitive
  # The block form of scan is required: it updates $1..$5 for every match.
  # Collecting the matches first and iterating afterwards would leave the
  # globals pointing at the last match only.
  string.scan(pattern) do
    if $1[-1] != ?;
      raise "Illegal character '#{$1}' in raw string \"#{string}\""
    elsif $1[0] == ?&
      if $5 and $5[0] == ?#
        # Numeric reference: decode ("x" prefix means hexadecimal) and
        # make sure the code point itself is allowed.
        case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
        when *VALID_CHAR
        else
          raise "Illegal character '#{$1}' in raw string \"#{string}\""
        end
      elsif $3 and !SUBSTITUTES.include?($1)
        if !doctype or !doctype.entities.has_key?($3)
          raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
        end
      end
    end
  end
end
|
|
|
|
|
|
|
|
# Identifies the kind of node. Always returns the Symbol :text.
def node_type
  :text
end
|
|
|
|
|
|
|
|
# Returns true when the stored string has no characters, false otherwise.
def empty?
  @string.empty?
end
|
|
|
|
|
|
|
|
|
|
|
|
# Returns a new Text constructed from this one (Text.new(self)); all other
# constructor arguments take their defaults.
def clone
  Text.new(self)
end
|
|
|
|
|
|
|
|
|
|
|
|
# Appends text to this text node.  The text is appended in the +raw+ mode
# of this text node.
#
# +to_append+ a String; "\r\n" and lone "\r" are converted to "\n" first
# +returns+ the stored string after the append
def <<( to_append )
  @string << to_append.gsub( /\r\n?/, "\n" )
  # The memoized results of to_s and value were computed from the old
  # content and must not be served again.
  @normalized = @unnormalized = nil
  @string
end
|
|
|
|
|
|
|
|
|
|
|
|
# +other+ a String or a Text
# +returns+ the result of (to_s <=> other.to_s)
def <=>( other )
  mine = to_s
  theirs = other.to_s
  mine <=> theirs
end
|
|
|
|
|
|
|
|
# Returns the doctype of the document this node's parent belongs to, or nil
# when there is no parent or the parent reports no document.
def doctype
  return nil unless @parent
  doc = @parent.document
  doc ? doc.doctype : nil
end
|
|
|
|
|
|
|
|
# Pattern Text.unnormalize substitutes on; it is built from
# Entity::REFERENCE.
REFERENCE = /#{Entity::REFERENCE}/
|
|
|
|
# Returns the string value of this text node.  This string is always
# escaped, meaning that it is a valid XML text node string, and all
# entities that can be escaped, have been inserted.  This method respects
# the entity filter set in the constructor. A raw node returns its stored
# string unchanged; otherwise the escaped form is computed once with
# Text.normalize and memoized.
#
#   # Assume that the entity "s" is defined to be "sean", and that the
#   # entity "r" is defined to be "russell"
#   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
#   t.to_s   #-> "&lt; &amp; &s; russell"
#   t = Text.new( "< & &s; russell", false, nil, false )
#   t.to_s   #-> "&lt; &amp; &s; russell"
#   u = Text.new( "sean russell", false, nil, true )
#   u.to_s   #-> "sean russell"
def to_s
  return @string if @raw
  @normalized ||= Text.normalize( @string, doctype, @entity_filter )
end
|
|
|
|
|
|
|
|
# Returns the inspect form of the stored string (quoted, with escapes).
def inspect
  @string.inspect
end
|
|
|
|
|
|
|
|
# Returns the string value of this text.  This is the text without
# entities, as it might be used programmatically, or printed to the
# console.  This ignores the 'raw' attribute setting, and any
# entity_filter. The result of Text.unnormalize is memoized.
#
#   # Assume that the entity "s" is defined to be "sean", and that the
#   # entity "r" is defined to be "russell"
#   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
#   t.value   #-> "< & sean russell"
#   t = Text.new( "< & &s; russell", false, nil, false )
#   t.value   #-> "< & sean russell"
#   u = Text.new( "sean russell", false, nil, true )
#   u.value   #-> "sean russell"
def value
  @unnormalized ||= Text.unnormalize( @string, doctype )
end
|
|
|
|
|
|
|
|
# Sets the contents of this text node.  This expects the text to be
# unnormalized. Line endings are converted to "\n", the memoized results
# of to_s and value are discarded, and the node stops being raw. As with
# any assignment, the expression evaluates to the assigned value.
#
#   e = Element.new( "a" )
#   e.add_text( "foo" )      # <a>foo</a>
#   e[0].value = "bar"       # <a>bar</a>
#   e[0].value = "<a>"       # <a>&lt;a&gt;</a>
def value=( val )
  @string = val.gsub( /\r\n?/, "\n" )
  @normalized = @unnormalized = nil
  @raw = false
end
|
|
|
|
|
|
|
|
# Recursively wraps +string+ at +width+ by replacing a space with a newline.
#
# +string+ the text to wrap
# +width+ the break is made at the last space at or before this index; if
# there is none, at the first space after it; a string with no space at
# all is returned unchanged instead of raising
# +addnewline+ (false) if true, a newline is put in front of the result
# when the string was actually wrapped
# +returns+ the wrapped text; +string+ itself when it already fits
def wrap(string, width, addnewline=false)
  return string if string.length <= width
  # Position in string with last ' ' before cutoff, else the next one.
  place = string.rindex(' ', width) || string.index(' ', width)
  return string unless place
  wrapped = string[0,place] + "\n" + wrap(string[place+1..-1], width)
  addnewline ? "\n" + wrapped : wrapped
end
|
|
|
|
|
|
|
|
# Prefixes every line of +string+ with +style+ repeated +level+ times and
# removes the trailing whitespace of each line. That trailing whitespace
# includes the line terminator, so the indented lines are concatenated
# without separators.
#
# +string+ the text to indent
# +level+ (1) number of repetitions of +style+; a negative level returns
# +string+ unchanged
# +style+ ("\t") the indentation unit
# +indentfirstline+ (true) if false, leading and trailing whitespace is
# stripped from the result, which removes the first line's indentation
def indent_text(string, level=1, style="\t", indentfirstline=true)
  return string if level < 0
  indent_string = style * level
  new_string = ''
  # each_line, because String itself is no longer enumerable line by line.
  string.each_line { |line|
    new_string << (indent_string + line).sub(/[\s]+$/,'')
  }
  new_string.strip! unless indentfirstline
  return new_string
end
|
|
|
|
|
|
|
|
# == DEPRECATED
# See REXML::Formatters
#
# Emits a deprecation warning, then writes this node to +writer+ with
# REXML::Formatters::Pretty when +indent+ is greater than -1 and with
# REXML::Formatters::Default otherwise. +transitive+ and +ie_hack+ are
# accepted but not used here.
def write( writer, indent=-1, transitive=false, ie_hack=false )
  Kernel.warn("#{self.class.name}.write is deprecated.  See REXML::Formatters")
  formatter =
    if indent > -1
      REXML::Formatters::Pretty.new( indent )
    else
      REXML::Formatters::Default.new
    end
  formatter.write( self, writer )
end
|
|
|
|
|
|
|
|
# FIXME
# This probably won't work properly
#
# Returns the parent's xpath with "/text()" appended. The node must have a
# parent; this method does not guard against a nil parent.
def xpath
  @parent.xpath + "/text()"
end
|
|
|
|
|
|
|
|
# Writes out text, substituting special characters beforehand.
# +out+ A String, IO, or any other object supporting <<( String )
# +input+ the text to substitute and then write out; it is not modified
#
# Sketch kept from the original comments (not executed): converting UTF-8
# text to ASCII with numeric references.
#   z=utf8.unpack("U*")
#   ascOut=""
#   z.each{|r|
#     if r < 0x100
#       ascOut.concat(r.chr)
#     else
#       ascOut.concat(sprintf("&#x%x;", r))
#     end
#   }
#   puts ascOut
def write_with_substitution(out, input)
  # dup rather than clone: clone keeps the frozen state and gsub! would
  # then fail on a frozen input.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!( SPECIALS[0], SUBSTITUTES[0] )
  copy.gsub!( SPECIALS[1], SUBSTITUTES[1] )
  copy.gsub!( SPECIALS[2], SUBSTITUTES[2] )
  copy.gsub!( SPECIALS[3], SUBSTITUTES[3] )
  copy.gsub!( SPECIALS[4], SUBSTITUTES[4] )
  copy.gsub!( SPECIALS[5], SUBSTITUTES[5] )
  out << copy
end
|
|
|
|
|
|
|
|
# Reads text, substituting entities
#
# +input+ the escaped text; it is not modified
# +illegal+ (nil) optional pattern; if it matches the text a ParseException
# is raised
# +returns+ a new String in which line endings are "\n" and the five
# predefined references (&lt; &gt; &quot; &apos; &amp;) as well as numeric
# character references are replaced by the characters they stand for
#
# All references are decoded in a single pass, so text produced by one
# replacement is never decoded again ("&amp;#65;" yields "&#65;", not "A").
def self.read_with_substitution( input, illegal=nil )
  # dup rather than clone so that a frozen input can still be processed.
  copy = input.dup

  if illegal and copy =~ illegal
    raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" )
  end

  copy.gsub!( /\r\n?/, "\n" )
  if copy.include?( '&' )
    copy.gsub!( /&(?:(lt|gt|quot|apos|amp)|#0*((?:\d+)|(?:x[a-fA-F0-9]+)));/ ) {
      named, numeric = $1, $2
      if named
        case named
        when 'lt'   then '<'
        when 'gt'   then '>'
        when 'quot' then '"'
        when 'apos' then "'"
        else             '&'
        end
      else
        # An "x" prefix marks hexadecimal digits; leading zeros were already
        # skipped by the pattern.
        code = numeric[0, 1] == 'x' ? numeric[1..-1].to_i(16) : numeric.to_i
        [code].pack('U*')
      end
    }
  end
  copy
end
|
|
|
|
|
|
|
|
# Matches an ampersand that is not the start of "&" + Entity::NAME + ";".
# Within this class it is referenced only by a commented-out line in
# Text.normalize.
EREFERENCE = /&(?!#{Entity::NAME};)/
|
|
|
|
# Escapes all possible entities
#
# +input+ any object; its to_s is escaped
# +doctype+ (nil) if given, every entity in doctype.entities that has a
# value is substituted: occurrences of the value become "&name;". Without
# a doctype, DocType::DEFAULT_ENTITIES is used.
# +entity_filter+ (nil) entities for which entity_filter.include?(entity)
# is true are not substituted.
# NOTE(review): the membership test receives the entity object, while the
# constructor documentation and Text.unnormalize work with entity names;
# a filter of names therefore has no effect here -- confirm the intent.
# +returns+ a new, escaped String
def self.normalize( input, doctype=nil, entity_filter=nil )
  copy = input.to_s
  # Doing it like this rather than in a loop improves the speed
  #copy = copy.gsub( EREFERENCE, '&amp;' )
  # Ampersands go first, so the ones introduced by the entity references
  # below are not escaped again.
  copy = copy.gsub( "&", "&amp;" )
  if doctype
    doctype.entities.each_value do |entity|
      next unless entity.value
      next if entity_filter and entity_filter.include?(entity)
      copy = copy.gsub( entity.value, "&#{entity.name};" )
    end
  else
    DocType::DEFAULT_ENTITIES.each_value do |entity|
      copy = copy.gsub(entity.value, "&#{entity.name};" )
    end
  end
  copy
end
|
|
|
|
|
|
|
|
# Unescapes all possible entities
#
# +string+ the escaped text; it is not modified
# +doctype+ (nil) if given, named references are resolved with
# doctype.entity(name); otherwise with DocType::DEFAULT_ENTITIES
# +filter+ (nil) names of entities whose references are left as they are
# +illegal+ (nil) accepted but not used by this method
# +returns+ a new String in which line endings are "\n", numeric
# references and "&amp;" are decoded, and resolvable named references are
# replaced; unresolvable references are kept verbatim
def self.unnormalize( string, doctype=nil, filter=nil, illegal=nil )
  string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
    ref = $&
    if ref[1] == ?#
      # Numeric character reference, hexadecimal when prefixed with "x".
      if ref[2] == ?x
        [ref[3...-1].to_i(16)].pack('U*')
      else
        [ref[2...-1].to_i].pack('U*')
      end
    elsif ref == '&amp;'
      '&'
    else
      name = ref[1...-1]
      if filter and filter.include?( name )
        ref
      elsif doctype
        doctype.entity( name ) or ref
      else
        entity_value = DocType::DEFAULT_ENTITIES[ name ]
        entity_value ? entity_value.value : ref
      end
    end
  }
end
|
|
|
|
end
|
|
|
|
end
|