# instiki/lib/maruku/input/parse_block.rb

#--
#   Copyright (C) 2006  Andrea Censi  <andrea (at) rubyforge.org>
#
# This file is part of Maruku.
#
#   Maruku is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   Maruku is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with Maruku; if not, write to the Free Software
#   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
#++


module MaRuKu; module In; module Markdown; module BlockLevelParser

	include Helpers
	include MaRuKu::Strings
	include MaRuKu::In::Markdown::SpanLevelParser

	# Array subclass used to accumulate the items produced while parsing
	# blocks; adds a helper that summarizes its tail for diagnostics.
	class BlockContext < Array
		# Returns a string made of a fixed heading followed by the +inspect+
		# form of (at most) the last five items, each on its own line.
		def describe
			limit = 5
			tail = last(limit)
			listing = tail.map { |item| "\n -#{item.inspect}" }.join
			"Last #{limit} elements: " + listing
		end
	end

	# Parses a Markdown document given as a single string: the text is
	# split into lines (split_lines), wrapped in a LineSource and handed
	# to parse_blocks, whose result is returned.
	def parse_text_as_markdown(text)
		parse_blocks(LineSource.new(split_lines(text)))
	end

	# Parses all the lines available from +src+ (a LineSource) into
	# block-level items and returns them in a BlockContext.
	#
	# Main loop: on every iteration check_block_extensions is consulted
	# first; when it returns a true value the loop simply starts over.
	# Otherwise the md_type of the current line selects which read_*
	# helper consumes input and what gets appended to the output.
	#
	# Post-processing, in this order:
	# 1. merge_ial is called on the output, then :ial elements are removed;
	# 2. the :empty markers pushed for blank lines are removed;
	# 3. for each :ul/:ol whose items all report !want_my_paragraph, every
	#    item becomes a :li_span and takes over the children of its first
	#    child; the same unwrapping is applied to the definitions of a
	#    :definition_list whose entries all report !want_my_paragraph.
	def parse_blocks(src)
		output = BlockContext.new

		# run state machine
		while src.cur_line

			# Block extensions get the first look at the current line.
			next if check_block_extensions(src, output, src.cur_line)

#  Prints detected type (useful for debugging)
#			puts "#{src.cur_line.md_type}|#{src.cur_line}"
			case src.cur_line.md_type
				when :empty;
					# Blank lines are remembered as :empty markers (dropped
					# again after the loop).
					output.push :empty
					src.ignore_line
				when :ial
					# Inline attribute list: the first capture of the match
					# (or "" when it is nil) is handed to interpret_extension.
					m =  InlineAttributeList.match src.shift_line
					content = m[1] ||  ""
					src2 = CharSource.new(content, src)
					interpret_extension(src2, output, [nil])
				when :ald
					output.push read_ald(src)
				when :text
					# A text line may start a table, a header underlined on
					# the next line, a definition list, or a paragraph.
					if src.cur_line =~ MightBeTableHeader and
						(src.next_line && src.next_line =~ TableSeparator)
						output.push read_table(src)
					# NOTE(review): next_line can presumably be nil on the last
					# line; this relies on nil answering md_type -- confirm.
					elsif [:header1,:header2].include? src.next_line.md_type
						output.push read_header12(src)
					elsif eventually_comes_a_def_list(src)
					 	definition = read_definition(src)
						# Append to the definition list just before, if any;
						# otherwise start a new one.
						if output.last.kind_of?(MDElement) &&
							output.last.node_type == :definition_list then
							output.last.children << definition
						else
							output.push md_el(:definition_list, [definition])
						end
					else # Start of a paragraph
						output.push read_paragraph(src)
					end
				when :header2, :hrule
					# A :header2 line that reaches this branch (i.e. was not
					# consumed by read_header12 above) is emitted as a
					# horizontal rule, exactly like an :hrule line.
					src.shift_line
					output.push md_hrule()
				when :header3
					output.push read_header3(src)
				when :ulist, :olist
					list_type = src.cur_line.md_type == :ulist ? :ul : :ol
					li = read_list_item(src)
					# append to current list if we have one
					if output.last.kind_of?(MDElement) &&
						output.last.node_type == list_type then
						output.last.children << li
					else
						output.push md_el(list_type, [li])
					end
				when :quote;    output.push read_quote(src)
				# read_code / read_raw_html results are appended only when
				# they are not nil.
				when :code;     e = read_code(src); output << e if e
				when :raw_html; e = read_raw_html(src); output << e if e

				when :footnote_text;   output.push read_footnote_text(src)
				when :ref_definition;  output.push read_ref_definition(src)
				when :abbreviation;    output.push read_abbreviation(src)
				# read_xml_instruction appends to output by itself.
				when :xml_instr;       read_xml_instruction(src, output)
				when :metadata;
					# Old-style meta-data lines are reported and skipped.
					maruku_error "Please use the new meta-data syntax: \n"+
					"  http://maruku.rubyforge.org/proposal.html\n", src
					src.ignore_line
				else # warn if we forgot something
					md_type = src.cur_line.md_type
					line = src.cur_line
					maruku_error "Ignoring line '#{line}' type = #{md_type}", src
					src.shift_line
			end
		end

		# Let merge_ial process the collected items, then drop the :ial
		# elements themselves.
		merge_ial(output, src, output)
		output.delete_if {|x| x.kind_of?(MDElement) &&
			x.node_type == :ial}

		# get rid of empty line markers
		output.delete_if {|x| x == :empty}
		# See for each list if we can omit the paragraphs and use li_span
		# TODO: do this after
		# NOTE(review): every remaining item is sent node_type here, so this
		# assumes no nil ended up in output (read_ald can return nil) --
		# confirm.
		output.each do |c|
			# Remove paragraphs that we can get rid of
			if [:ul,:ol].include? c.node_type
				if c.children.all? {|li| !li.want_my_paragraph} then
					c.children.each do |d|
						d.node_type = :li_span
						# The item takes over the children of its first child.
						d.children = d.children[0].children
					end
				end
			end
			if c.node_type == :definition_list
				if c.children.all?{|defi| !defi.want_my_paragraph} then
					c.children.each do |definition|
						definition.definitions.each do |dd|
							# Same unwrapping, applied to each definition body.
							dd.children = dd.children[0].children
						end
					end
				end
			end
		end

		output
	end


	# Consumes one line from +src+ and interprets it as an attribute list
	# definition (AttributeDefinitionList).
	#
	# On a match the parsed attribute list is stored in +ald+ under the
	# captured id and md_ald(id, attributes) is returned. If the line does
	# not match, an error is reported and nil is returned.
	def read_ald(src)
		line = src.shift_line
		unless line =~ AttributeDefinitionList
			maruku_error "Bug Bug:\n#{line.inspect}"
			return nil
		end

		name, raw_attributes = $1, $2
		attributes = read_attribute_list(CharSource.new(raw_attributes, src), nil, [nil])
		self.ald[name] = attributes
		md_ald(name, attributes)
	end

	# Reads a two-line header: a title line followed by an underline
	# (the ----- or ======== kind).
	#
	# The title is stripped; when new_meta_data? is true and the title
	# ends with "{...}", the braces' content is parsed as an attribute
	# list and removed from the title. The level is 2 when the underline
	# is of type :header2, and 1 otherwise. Both lines are consumed.
	# Returns md_header(level, spans, attributes).
	def read_header12(src)
		title = src.shift_line.strip
		attributes = nil
		if new_meta_data?
			ial_match = /^(.*)\{(.*)\}\s*$/.match(title)
			if ial_match
				title = ial_match[1].strip
				attributes = read_attribute_list(CharSource.new(ial_match[2], src), nil, [nil])
			end
		end
		spans = parse_lines_as_span([title])
		level = (src.cur_line.md_type == :header2) ? 2 : 1
		src.shift_line
		md_header(level, spans, attributes)
	end

	# Reads a one-line header written as '#### header ####'.
	#
	# The line is consumed and stripped; when new_meta_data? is true and
	# the line ends with "{...}", the braces' content is parsed as an
	# attribute list and removed. The level comes from num_leading_hashes
	# and the text from strip_hashes.
	# Returns md_header(level, spans, attributes).
	def read_header3(src)
		heading = src.shift_line.strip
		attributes = nil
		if new_meta_data?
			ial_match = /^(.*)\{(.*)\}\s*$/.match(heading)
			if ial_match
				heading = ial_match[1].strip
				attributes = read_attribute_list(CharSource.new(ial_match[2], src), nil, [nil])
			end
		end
		depth = num_leading_hashes(heading)
		spans = parse_lines_as_span([strip_hashes(heading)])
		md_header(depth, spans, attributes)
	end

	# Reads an XML processing instruction ("<?target code ?>"), possibly
	# spanning several lines, and appends the result to +output+.
	#
	# src::    line source; lines are consumed up to and including the
	#          one containing "?>".
	# output:: collection receiving the resulting element(s).
	#
	# When the target is 'mrk' and MaRuKu::Globals[:unsafe_features] is
	# set, the code is run through safe_execute_code and its (non-String)
	# result is appended; otherwise md_xml_instr(target, code) is appended.
	#
	# Problems are reported through maruku_error:
	# * text following "?>" on the closing line;
	# * input ending before "?>" is found. In that case everything read
	#   so far is used as the instruction body instead of failing on the
	#   missing line.
	#
	# Raises RuntimeError ("BugBug") if the first line does not look like
	# the start of an instruction, and "Not expected" if the executed code
	# returns a String.
	def read_xml_instruction(src, output)
		m = /^\s*<\?((\w+)\s*)?(.*)$/.match src.shift_line
		raise "BugBug" if not m
		target = m[2] || ''
		code = m[3]

		# Keep appending lines until the terminator shows up, but never
		# read past the end of the input.
		terminated = true
		until code =~ /\?>/
			if src.cur_line.nil?
				terminated = false
				break
			end
			code += "\n"+src.shift_line
		end

		if not terminated
			maruku_error "XML instruction is not terminated by '?>':\n"+
				add_tabs(code, 1, '|'), src
		elsif not code =~ (/\?>\s*$/)
			garbage = (/\?>(.*)$/.match(code))[1]
			maruku_error "Trailing garbage on last line: #{garbage.inspect}:\n"+
				add_tabs(code, 1, '|'), src
		end
		# Drop the terminator (when it closes the last line).
		code = code.gsub(/\?>\s*$/, '')

		if target == 'mrk' && MaRuKu::Globals[:unsafe_features]
			result = safe_execute_code(self, code)
			if result
				if result.kind_of? String
					raise "Not expected"
				else
					output.push(*result)
				end
			end
		else
			output.push md_xml_instr(target, code)
		end
	end

	# Reads a block of raw HTML from +src+.
	#
	# Lines are fed to an HTMLHelper (the first one as-is, the following
	# ones prefixed with a newline) until the helper reports is_finished?
	# or the input ends. Errors raised while feeding the helper are
	# reported through maruku_error and whatever the helper has
	# accumulated so far is still used.
	#
	# Only StandardError is rescued, so that Interrupt, SystemExit and
	# similar non-error exceptions are not swallowed by the parser.
	#
	# Returns md_html(raw_html).
	def read_raw_html(src)
		h = HTMLHelper.new
		begin
			h.eat_this(src.shift_line)
#			puts "\nBLOCK:\nhtml -> #{l.inspect}"
			while src.cur_line and not h.is_finished?
				l=src.shift_line
#				puts "html -> #{l.inspect}"
				h.eat_this "\n"+l
			end
		rescue StandardError => e
			ex = e.inspect + e.backtrace.join("\n")
			maruku_error "Bad block-level HTML:\n#{add_tabs(ex,1,'|')}\n", src
		end
		raw_html = h.stuff_you_read
		return md_html(raw_html)
	end

	# Collects the lines of a paragraph from +src+ and returns
	# md_par(parse_lines_as_span(lines, src)).
	#
	# Collection stops (leaving the current line unconsumed) when:
	# * the current line is a quote, '#' header, empty line, raw HTML,
	#   reference definition, IAL or XML instruction;
	# * the current line is a list marker AND the next line has the same
	#   list type (a single list-like line does not end the paragraph);
	# * the current line is blank once stripped;
	# * the next line is a header underline (:header1 / :header2);
	# * a block extension matches the current line.
	def read_paragraph(src)
		stoppers = [:quote,:header3,:empty,:raw_html,:ref_definition,:ial,:xml_instr]
		list_markers = [:olist,:ulist]
		underlines = [:header1,:header2]

		collected = []
		while src.cur_line
			kind = src.cur_line.md_type
			break if stoppers.include?(kind)
			break if list_markers.include?(kind) && src.next_line.md_type == kind
			break if src.cur_line.strip.empty?
			break if underlines.include?(src.next_line.md_type)
			break if any_matching_block_extension?(src.cur_line)

			collected << src.shift_line
		end

		md_par(parse_lines_as_span(collected, src))
	end

	# Reads one list item (ordered or unordered) starting at the current
	# line of +src+.
	#
	# The item's first line loses its leading marker area (as measured by
	# spaces_before_first_char); the continuation lines come from
	# read_indented_content. The resulting lines are parsed recursively
	# with parse_blocks through a child LineSource.
	#
	# Returns md_li(children, with_par), where with_par is true when
	# read_indented_content asked for a paragraph or when the item
	# produced more than one child.
	def read_list_item(src)
		parent_offset = src.cur_index
		item_type = src.cur_line.md_type
		opening = src.shift_line

		# The same indentation is used to cut the marker off the first
		# line and to un-indent the continuation lines.
		indentation = spaces_before_first_char(opening)
		body, want_my_paragraph =
			read_indented_content(src, indentation, [:ulist, :olist, :ial], item_type)

		# Put the first line (without its '*', '-', '+' or number) in front.
		body.unshift(opening[indentation, opening.size - 1])

		children = parse_blocks(LineSource.new(body, src, parent_offset))
		with_par = want_my_paragraph || (children.size > 1)

		md_li(children, with_par)
	end

	# Consumes one line from +src+ and interprets it as an abbreviation
	# definition (Abbreviation); the two captures are the abbreviation and
	# its description.
	#
	# An error is reported when the line does not match, and another one
	# when the abbreviation is missing or empty; in both cases processing
	# continues. The pair is stored in +abbreviations+ and
	# md_abbr_def(abbr, desc) is returned.
	def read_abbreviation(src)
		line = src.shift_line
		found = Abbreviation.match(line)
		unless found
			maruku_error "Bug: it's Andrea's fault. Tell him.\n#{line.inspect}"
		end

		abbr = found ? found[1] : nil
		desc = found ? found[2] : nil

		if abbr.nil? || abbr.size == 0
			maruku_error "Bad abbrev. abbr=#{abbr.inspect} desc=#{desc.inspect}"
		end

		self.abbreviations[abbr] = desc
		md_abbr_def(abbr, desc)
	end

	# Reads the text of a footnote starting at the current line of +src+.
	#
	# The first line is matched against FootnoteText (captures: id and the
	# optional text following it); an error is reported if it does not
	# match. Continuation lines are gathered by read_indented_content with
	# a fixed indentation of 4. The first-line text is put in front of
	# them unless it is missing or blank, and everything is parsed
	# recursively with parse_blocks.
	#
	# The resulting md_footnote element is stored in +footnotes+ under the
	# id and returned.
	def read_footnote_text(src)
		parent_offset = src.cur_index
		opening = src.shift_line

		found = FootnoteText.match(opening)
		maruku_error "Bug (it's Andrea's fault)" unless found

		id   = found ? found[1] : nil
		text = found ? found[2] : nil

#		puts "id =_#{id}_; text=_#{text}_ indent=4"

		# Ugly things going on inside `read_indented_content`
		body, _want_my_paragraph =
			read_indented_content(src, 4, [:footnote_text], :footnote_text)

		body.unshift(text) unless text.nil? || text.strip == ""

#		dbg_describe_ary(body, 'FOOTNOTE')
		children = parse_blocks(LineSource.new(body, src, parent_offset))

		footnote = md_footnote(id, children)
		self.footnotes[id] = footnote
		footnote
	end


	# This is the only ugly function in the code base.
	# It is used to read list items, descriptions, footnote text
	#
	# Collects, from +src+, the lines that belong to the item whose first
	# line has already been consumed by the caller.
	#
	# src::         line source positioned right after the item's first line.
	# indentation:: amount passed to strip_indent for every accepted line,
	#               and minimum number of leading spaces required of a
	#               line that follows a blank line.
	# break_list::  md_types that end the item as long as no blank line
	#               has been seen yet.
	# item_type::   md_type of the item being read; only used to compute
	#               want_my_paragraph.
	#
	# Returns the pair [lines, want_my_paragraph]. +lines+ has trailing
	# empty lines removed. +want_my_paragraph+ is true when some non-empty
	# line was accepted after a blank line, or when a blank line was seen
	# and reading stopped at a line of type +item_type+.
	def read_indented_content(src, indentation, break_list, item_type)
		lines =[]
		# collect all indented lines
		# Note: saw_empty is never reset once a blank line has been seen.
		saw_empty = false; saw_anything_after = false
		while src.cur_line
			#puts "#{src.cur_line.md_type} #{src.cur_line.inspect}"
			# Blank lines are always collected (trailing ones are dropped
			# at the end).
			if src.cur_line.md_type == :empty
				saw_empty = true
				lines << src.shift_line
				next
			end

			# after a white line
			if saw_empty
				# we expect things to be properly aligned
				if (ns=number_of_leading_spaces(src.cur_line)) < indentation
					#puts "breaking for spaces, only #{ns}: #{src.cur_line}"
					break
				end
				saw_anything_after = true
			else
				# Before any blank line only the types in break_list end
				# the item; indentation is not checked.
				break if break_list.include? src.cur_line.md_type
#				break if src.cur_line.md_type != :text
			end


			stripped = strip_indent(src.shift_line, indentation)
			lines << stripped

			#puts "Accepted as #{stripped.inspect}"

			# You are only required to indent the first line of
			# a child paragraph.
			# So, once a text line has been accepted, the :text lines that
			# directly follow it are taken too, without the checks above.
			if stripped.md_type == :text
				while src.cur_line && (src.cur_line.md_type == :text)
					lines << strip_indent(src.shift_line, indentation)
				end
			end
		end

		want_my_paragraph = saw_anything_after ||
			(saw_empty && (src.cur_line  && (src.cur_line.md_type == item_type)))

#		dbg_describe_ary(lines, 'LI')
		# create a new context

		# Drop the blank lines collected at the end of the item.
		while lines.last && (lines.last.md_type == :empty)
			lines.pop
		end

		return lines, want_my_paragraph
	end


	# Reads a block quote: consumes every consecutive line of type :quote,
	# passes each through unquote, and parses the resulting lines
	# recursively with parse_blocks (through a child LineSource).
	# Returns md_quote(children).
	def read_quote(src)
		parent_offset = src.cur_index

		quoted = []
		quoted << unquote(src.shift_line) while src.cur_line && src.cur_line.md_type == :quote
#		dbg_describe_ary(quoted, 'QUOTE')

		inner = LineSource.new(quoted, src, parent_offset)
		md_quote(parse_blocks(inner))
	end

	# Reads an indented code block: consumes every consecutive line of
	# type :code or :empty, removing 4 columns of indentation from each
	# (strip_indent). Blank lines at both ends are discarded.
	#
	# Returns md_codeblock(source), with the lines joined by "\n", or nil
	# when nothing but blank lines was read.
	def read_code(src)
		body = []
		while src.cur_line && [:code, :empty].include?(src.cur_line.md_type)
			body << strip_indent(src.shift_line, 4)
		end

		# Trim blank lines at the end, then at the beginning.
		body.pop   while body.last  && body.last.strip.size == 0
		body.shift while body.first && body.first.strip.size == 0

		return nil if body.empty?

#		dbg_describe_ary(body, 'CODE')
		md_codeblock(body.join("\n"))
	end

	# Reads a run of :metadata lines, possibly separated by empty lines,
	# and merges the result of parse_metadata for each of them into one
	# hash. Empty lines are consumed and ignored; reading stops at the
	# first line of any other type (left unconsumed) or at end of input.
	# Returns the hash.
	def read_metadata(src)
		collected = {}
		while (line = src.cur_line)
			kind = line.md_type
			if kind == :empty
				src.shift_line
			elsif kind == :metadata
				collected.merge! parse_metadata(src.shift_line)
			else
				break
			end
		end
		collected
	end


	# Reads a link reference definition ("[id]: url "title" key=value ...")
	# from +src+.
	#
	# If the following line is not itself a reference definition and is
	# indented by 1 to 3 spaces, it is treated as the continuation of the
	# definition and joined to the first line with a space.
	#
	# The id is stripped and downcased. The url and title are stored in
	# +refs+ under that id, together with any trailing key=value pairs
	# (keys are symbolized; only the first '=' separates key from value; a
	# leading double quote and a matching trailing one are removed from
	# the value; pairs without a key are ignored).
	#
	# Returns md_ref_def(id, url, {:title => title}).
	#
	# When the line does not match LinkRegex, the problem is reported
	# through maruku_error and md_br() is returned as a placeholder (the
	# same recovery read_table uses), so that the caller still receives an
	# element.
	def read_ref_definition(src)
		line = src.shift_line

		# if link is incomplete, shift next line
		if src.cur_line && (src.cur_line.md_type != :ref_definition) &&
			([1,2,3].include? number_of_leading_spaces(src.cur_line) )
			line += " "+ src.shift_line
		end

#		puts "total= #{line}"

		match = LinkRegex.match(line)
		if not match
			maruku_error "Link does not respect format: '#{line}'", src
			return md_br()
		end

		id = match[1].strip.downcase
		url = match[2]
		title = match[3]

		hash = self.refs[id] = {:url=>url,:title=>title}

		stuff = match[4]

		if stuff
			stuff.split.each do |couple|
				# Split on the first '=' only, so values may contain '='.
				k, v = couple.split('=', 2)
				next if k.nil? || k.empty?
				v ||= ""
				# Remove the surrounding quotes; the closing one only when
				# it is really there.
				if v[0,1]=='"' then v = v[1..-1].sub(/"\z/, '') end
				hash[k.to_sym] = v
			end
		end

		return md_ref_def(id, url, {:title=>title})
	end

	# Splits one table line on '|' and returns the stripped, non-empty
	# cell texts.
	def split_cells(s)
		s.strip.split('|').map{|x| x.strip}.reject{|x| x.empty?}
	end

	# Reads a table: a header line, a separator line, and then every
	# following line that contains a '|'.
	#
	# The separator cells define the number of columns and, through Sep,
	# their alignment: colons on both sides give :center, a colon on the
	# right only gives :right, anything else gives :left.
	#
	# Returns md_el(:table, cells, {:align => align}) where +cells+ is the
	# flat list of :head_cell elements followed by the :cell elements of
	# every row.
	#
	# If the header, or any row, does not have exactly as many cells as
	# the separator, the problem is reported through maruku_error /
	# tell_user and md_br() is returned in place of the table.
	def read_table(src)
		head = split_cells(src.shift_line).map{|s| md_el(:head_cell, parse_lines_as_span([s])) }

		separator = split_cells(src.shift_line)

		align = separator.map do |s|
			m = Sep.match(s)
			left_colon  = m && m[1]
			right_colon = m && m[2]
			if left_colon and right_colon then :center
			elsif right_colon then :right
			else :left end
		end

		num_columns = align.size

		if head.size != num_columns
			maruku_error "Table head does not have #{num_columns} columns: \n#{head.inspect}"
			tell_user "I will ignore this table."
			# XXX try to recover
			return md_br()
		end

		rows = []

		while src.cur_line && src.cur_line =~ /\|/
			row = split_cells(src.shift_line).map{|s|
				md_el(:cell, parse_lines_as_span([s]))}
			# Check the width of the row just read (not of the header).
			if row.size != num_columns
				maruku_error  "Row does not have #{num_columns} columns: \n#{row.inspect}"
				tell_user "I will ignore this table."
				# XXX try to recover
				return md_br()
			end
			rows << row
		end

		children = (head+rows).flatten
		return md_el(:table, children, {:align => align})
	end

	# Tells whether a definition list starts at the current line, judging
	# from the string returned by src.tell_me_the_future: it must begin
	# with one or more 't', then at most one 'e', then a 'd'.
	# Returns the result of the regexp match (0 on success, nil otherwise).
	def eventually_comes_a_def_list(src)
		upcoming = src.tell_me_the_future
#		puts "future: #{upcoming}"
		upcoming =~ %r{^t+e?d}x
	end


	# Reads one entry of a definition list: one or more term lines
	# (type :text), an optional empty line, then one or more definitions
	# (type :definition), each of which may continue on indented lines.
	#
	# Returns an md_el(:definition, ...) whose children are the terms
	# followed by the definitions, and whose attributes carry :terms,
	# :definitions and :want_my_paragraph. The latter is set when an empty
	# line separates terms and definitions, or when read_indented_content
	# asks for it for any of the definitions.
	#
	# Raises RuntimeError ("Chunky Bacon!") if the input ends right after
	# the terms, or if what follows the terms is not a definition.
	def read_definition(src)
		# Read one or more terms
		terms = []
		while src.cur_line && src.cur_line.md_type == :text
			term_spans = parse_lines_as_span([src.shift_line])
			terms << md_el(:definition_term, term_spans)
		end
#		dbg_describe_ary(terms, 'DT')

		raise "Chunky Bacon!" unless src.cur_line

		# one optional empty
		want_my_paragraph = false
		if src.cur_line.md_type == :empty
			want_my_paragraph = true
			src.shift_line
		end

		raise "Chunky Bacon!" unless src.cur_line.md_type == :definition

		# Read one or more definitions
		definitions = []
		while src.cur_line && src.cur_line.md_type == :definition
			parent_offset = src.cur_index

			# Keep only what the Definition pattern captures on the first line.
			found = Definition.match(src.shift_line)
			opening = found ? found[1] : nil

			body, asks_for_paragraph =
				read_indented_content(src, 4, [:definition], :definition)
			want_my_paragraph = want_my_paragraph || asks_for_paragraph

			body.unshift opening

#			dbg_describe_ary(body, 'DD')
			children = parse_blocks(LineSource.new(body, src, parent_offset))
			definitions << md_el(:definition_data, children)
		end

		attributes = {
			:terms => terms,
			:definitions => definitions,
			:want_my_paragraph => want_my_paragraph}
		md_el(:definition, terms + definitions, attributes)
	end
end # BlockLevelParser
end # MaRuKu
end
end