Bring up to current.

2007-01-22 08:36:51 -06:00 · 2007-01-22 08:36:51 -06:00 · b19e1e4f47
commit b19e1e4f47
parent 69b62b6f33
71 changed files with 8305 additions and 39 deletions
--- a/lib/maruku/attic/parse_span.rb.txt
+++ b/lib/maruku/attic/parse_span.rb.txt
@ -0,0 +1,462 @@
+#   Copyright (C) 2006  Andrea Censi  <andrea (at) rubyforge.org>
+#
+# This file is part of Maruku.
+# 
+#   Maruku is free software; you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2 of the License, or
+#   (at your option) any later version.
+# 
+#   Maruku is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+# 
+#   You should have received a copy of the GNU General Public License
+#   along with Maruku; if not, write to the Free Software
+#   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+
+
+
+#  NOTE: this is the old span-level regexp-based parser.
+#
+#  The new parser is a real parser and is defined with functions in parse_span_better.rb
+#  The new parser is faster, handles syntax errors, but it's absolutely not readable.
+#
+#  Also, regexp parsers simply CANNOT handle inline HTML properly.
+
+
+
+# There are two black-magic methods `match_couple_of` and `map_match`,
+# defined at the end of the file, that make the function 
+# `parse_lines_as_span` so elegant.
+
+class Maruku
+
+	# Takes care of all span-level formatting, links, images, etc.
+	#
+	# Lines must not contain block-level elements.
+	def parse_lines_as_span(lines)
+		
+		# first, get rid of linebreaks
+		res = resolve_linebreaks(lines)
+
+		span = MDElement.new(:dummy, res)
+
+		# encode all escapes
+		span.replace_each_string { |s| s.escape_md_special }
+
+
+# The order of processing is significant: 
+# 1. inline code
+# 2. immediate links
+# 3. inline HTML 
+# 4. everything else
+
+		# search for ``code`` markers
+		span.match_couple_of('``') { |children, match1, match2| 
+			e = create_md_element(:inline_code)
+			# this is now opaque to processing
+			e.meta[:raw_code] = children.join('').it_was_a_code_block
+			e
+		}
+
+		# Search for `single tick`  code markers
+		span.match_couple_of('`') { |children, match1, match2|
+			e = create_md_element(:inline_code)
+			# this is now opaque to processing
+			e.meta[:raw_code] = children.join('').it_was_a_code_block
+			# this is now opaque to processing
+			e
+		}
+		
+		# Detect any immediate link: <http://www.google.com>
+		# we expect an http: or something: at the beginning
+		span.map_match( /<(\w+:[^\>]+)>/) { |match| 
+			url = match[1]
+			
+			e = create_md_element(:immediate_link, [])
+			e.meta[:url] = url
+			e
+		}
+		
+		# Search for inline HTML (the support is pretty basic for now)
+		
+		# this searches for a matching block
+		inlineHTML1 = %r{
+			(   # put everything in 1 
+			<   # open
+			(\w+) # opening tag in 2
+			>   # close
+			.*  # anything
+			</\2> # match closing tag
+			)
+		}x
+
+		# this searches for only one block
+		inlineHTML2 = %r{
+			(   # put everything in 1 
+			<   # open
+			\w+ # 
+			    # close
+			[^<>]*  # anything except
+			/> # closing tag
+			)
+		}x
+		
+		for reg in [inlineHTML1, inlineHTML2]
+			span.map_match(reg) { |match| 
+				raw_html = match[1]
+				convert_raw_html_in_list(raw_html)
+			}
+		end
+		
+		# Detect footnotes references: [^1]
+		span.map_match(/\[(\^[^\]]+)\]/) { |match| 
+			id = match[1].strip.downcase
+			e = create_md_element(:footnote_reference)
+			e.meta[:footnote_id] = id
+			e
+		}
+
+		# Detect any image like ![Alt text][url]
+		span.map_match(/\!\[([^\]]+)\]\s?\[([^\]]*)\]/) { |match|
+			alt = match[1]
+			id = match[2].strip.downcase
+			
+			if id.size == 0
+				id = text.strip.downcase
+			end
+			
+			e = create_md_element(:image)
+			e.meta[:ref_id] = id
+			e
+		}
+
+		# Detect any immage with immediate url: ![Alt](url "title")
+		# a dummy ref is created and put in the symbol table
+		link1 = /!\[([^\]]+)\]\s?\(([^\s\)]*)(?:\s+["'](.*)["'])?\)/
+		span.map_match(link1) { |match|
+			alt = match[1]
+			url = match[2]
+			title = match[3]
+			
+			url = url.strip
+			# create a dummy id
+			id="dummy_#{@refs.size}"
+			@refs[id] = {:url=>url, :title=>title}
+			
+			e = create_md_element(:image)
+			e.meta[:ref_id] = id
+			e
+		}
+
+		# an id reference: "[id]",  "[ id  ]"
+		reg_id_ref = %r{
+			\[ # opening bracket 
+			([^\]]*) # 0 or more non-closing bracket (this is too permissive)
+			\] # closing bracket
+			}x
+			
+		
+		# validates a url, only $1 is set to the url
+ 		reg_url = 
+			/((?:\w+):\/\/(?:\w+:{0,1}\w*@)?(?:\S+)(?::[0-9]+)?(?:\/|\/([\w#!:.?+=&%@!\-\/]))?)/
+		reg_url = %r{([^\s\]\)]+)}
+		
+		# A string enclosed in quotes.
+		reg_title = %r{
+			" # opening
+			[^"]*   # anything = 1
+			" # closing
+			}x
+		
+		# [bah](http://www.google.com "Google.com"), 
+		# [bah](http://www.google.com),
+		# [empty]()
+		reg_url_and_title = %r{
+			\(  # opening
+			\s* # whitespace 
+			#{reg_url}?  # url = 1 might be  empty
+			(?:\s+["'](.*)["'])? # optional title  = 2
+			\s* # whitespace 
+			\) # closing
+		}x
+		
+		# Detect a link like ![Alt text][id]
+		span.map_match(/\[([^\]]+)\]\s?\[([^\]]*)\]/) { |match|
+			text = match[1]
+			id = match[2].strip.downcase
+			
+			if id.size == 0
+				id = text.strip.downcase
+			end
+
+			children = parse_lines_as_span(text)
+			e = create_md_element(:link, children)
+			e.meta[:ref_id] = id
+			e
+		}
+		
+		# Detect any immage with immediate url: ![Alt](url "title")
+		# a dummy ref is created and put in the symbol table
+		link1 = /!\[([^\]]+)\]\s?\(([^\s\)]*)(?:\s+["'](.*)["'])?\)/
+		span.map_match(link1) { |match|
+			text = match[1]
+			children = parse_lines_as_span(text)
+			
+			url = match[2]
+			title = match[3]
+			
+			url = url.strip
+			# create a dummy id
+			id="dummy_#{@refs.size}"
+			@refs[id] = {:url=>url, :title=>title}
+			@refs[id][:title] = title if title
+			
+			e = create_md_element(:link, children)
+			e.meta[:ref_id] = id
+			e
+		}
+		
+
+		# Detect any link like [Google engine][google]
+		span.match_couple_of('[',  # opening bracket
+			%r{\]                   # closing bracket
+			[ ]?                    # optional whitespace
+			#{reg_id_ref} # ref id, with $1 being the reference 
+			}x
+				) { |children, match1, match2| 
+			id = match2[1]
+			id = id.strip.downcase
+			
+			if id.size == 0
+				id = children.join.strip.downcase
+			end
+			
+			e = create_md_element(:link, children)
+			e.meta[:ref_id] = id
+			e
+		}
+
+		# Detect any link with immediate url: [Google](http://www.google.com)
+		# XXX Note that the url can be empty: [Empty]()
+		# a dummy ref is created and put in the symbol table
+		span.match_couple_of('[',  # opening bracket
+				%r{\]                   # closing bracket
+				[ ]?                    # optional whitespace
+				#{reg_url_and_title}    # ref id, with $1 being the url and $2 being the title
+				}x
+					) { |children, match1, match2| 
+			
+			url   = match2[1]
+			title = match2[3] # XXX? Is it a bug? I would use [2]
+			 
+			# create a dummy id
+			id="dummy_#{@refs.size}"
+			@refs[id] = {:url=>url}
+			@refs[id][:title] = title if title
+
+			e = create_md_element(:link, children)
+			e.meta[:ref_id] = id
+			e
+		}
+
+		# Detect an email address <andrea@invalid.it>
+		span.map_match(EMailAddress) { |match| 
+			email = match[1]
+			e = create_md_element(:email_address, [])
+			e.meta[:email] = email
+			e
+		}
+		
+		# Detect HTML entitis
+		span.map_match(/&([\w\d]+);/) { |match| 
+			entity_name = match[1]
+
+			e = create_md_element(:entity, [])
+			e.meta[:entity_name] = entity_name
+			e
+		}
+
+
+		# And now the easy stuff
+
+		# search for ***strong and em***
+		span.match_couple_of('***') { |children,m1,m2|  
+			create_md_element(:strong, [create_md_element(:emphasis, children)] ) }
+
+		span.match_couple_of('___') { |children,m1,m2|  
+			create_md_element(:strong, [create_md_element(:emphasis, children)] ) }
+	
+		# search for **strong**
+		span.match_couple_of('**') { |children,m1,m2|  create_md_element(:strong,   children) }
+
+		# search for __strong__
+		span.match_couple_of('__') { |children,m1,m2|  create_md_element(:strong,   children) }
+
+		# search for *emphasis*
+		span.match_couple_of('*')  { |children,m1,m2|  create_md_element(:emphasis, children) }
+		
+		# search for _emphasis_
+		span.match_couple_of('_')  { |children,m1,m2|  create_md_element(:emphasis, children) }
+		
+		# finally, unescape the special characters
+		span.replace_each_string { |s|  s.unescape_md_special}
+		
+		span.children
+	end
+	
+	# returns array containing Strings or :linebreak elements
+	def resolve_linebreaks(lines)
+		res = []
+		s = ""
+		lines.each do |l| 
+			s += (s.size>0 ? " " : "") + l.strip
+			if force_linebreak?(l)
+				res << s
+				res << create_md_element(:linebreak)
+				s = ""
+			end
+		end
+		res << s if s.size > 0
+		res
+	end
+
+	# raw_html is something like 
+	#  <em> A</em> dopwkk *maruk* <em>A</em>  
+	def convert_raw_html_in_list(raw_html)
+		e = create_md_element(:raw_html)
+		e.meta[:raw_html]  = raw_html
+		begin
+			e.meta[:parsed_html] = Document.new(raw_html)
+		rescue 
+			$stderr.puts "convert_raw_html_in_list Malformed HTML:\n#{raw_html}"
+		end
+		e
+	end
+
+end
+
+# And now the black magic that makes the part above so elegant
+class MDElement	
+	
+	# Try to match the regexp to each string in the hierarchy
+	# (using `replace_each_string`). If the regexp match, eliminate
+	# the matching string and substitute it with the pre_match, the
+	# result of the block, and the post_match
+	#
+	#   ..., matched_string, ... -> ..., pre_match, block.call(match), post_match
+	#
+	# the block might return arrays.
+	#
+	def map_match(regexp, &block)
+		replace_each_string { |s| 
+			processed = []
+			while (match = regexp.match(s))
+				# save the pre_match
+				processed << match.pre_match if match.pre_match && match.pre_match.size>0
+				# transform match
+				result = block.call(match)
+				# and append as processed
+				[*result].each do |e| processed << e end
+				# go on with the rest of the string
+				s = match.post_match 
+			end
+			processed << s if s.size > 0
+			processed
+		}
+	end
+	
+	# Finds couple of delimiters in a hierarchy of Strings and MDElements
+	#
+	# Open and close are two delimiters (like '[' and ']'), or two Regexp.
+	#
+	# If you don't pass close, it defaults to open.
+	#
+	# Each block is called with |contained children, match1, match2|
+	def match_couple_of(open, close=nil, &block)
+		close = close || open
+		 open_regexp =  open.kind_of?(Regexp) ?  open : Regexp.new(Regexp.escape(open))
+		close_regexp = close.kind_of?(Regexp) ? close : Regexp.new(Regexp.escape(close))
+		
+		# Do the same to children first
+		for c in @children; if c.kind_of? MDElement
+			c.match_couple_of(open_regexp, close_regexp, &block)
+		end end
+		
+		processed_children = []
+		
+		until @children.empty?
+			c = @children.shift
+			if c.kind_of? String
+				match1 = open_regexp.match(c)
+				if not match1
+					processed_children << c
+				else # we found opening, now search closing
+#					puts "Found opening (#{marker}) in #{c.inspect}"
+					# pre match is processed
+					processed_children.push match1.pre_match if 
+						match1.pre_match && match1.pre_match.size > 0
+					# we will process again the post_match
+					@children.unshift match1.post_match if 
+						match1.post_match && match1.post_match.size>0
+					
+					contained = []; found_closing = false
+					until @children.empty?  || found_closing
+						c = @children.shift
+						if c.kind_of? String
+							match2 = close_regexp.match(c)
+							if not match2 
+								contained << c
+							else
+								# we found closing
+								found_closing = true
+								# pre match is contained
+								contained.push match2.pre_match if 
+									match2.pre_match && match2.pre_match.size>0
+								# we will process again the post_match
+								@children.unshift match2.post_match if 
+									match2.post_match && match2.post_match.size>0
+
+								# And now we call the block
+								substitute = block.call(contained, match1, match2) 
+								processed_children  << substitute
+								
+#								puts "Found closing (#{marker}) in #{c.inspect}"
+#								puts "Children: #{contained.inspect}"
+#								puts "Substitute: #{substitute.inspect}"
+							end
+						else
+							contained << c
+						end
+					end
+					
+					if not found_closing
+						# $stderr.puts "##### Could not find closing for #{open}, #{close} -- ignoring"
+						processed_children << match1.to_s
+						contained.reverse.each do |c|
+							@children.unshift c
+						end
+					end
+				end
+			else
+				processed_children << c
+			end
+		end
+		
+		raise "BugBug" unless @children.empty?
+		
+		rebuilt = []
+		# rebuild strings
+		processed_children.each do |c|
+			if c.kind_of?(String) && rebuilt.last && rebuilt.last.kind_of?(String)
+				rebuilt.last << c
+			else
+				rebuilt << c
+			end
+		end
+		@children = rebuilt
+	end
+end