instiki/lib/maruku/attic/parse_span.rb.txt
2007-01-22 08:36:51 -06:00

462 lines
12 KiB
Text

# Copyright (C) 2006 Andrea Censi <andrea (at) rubyforge.org>
#
# This file is part of Maruku.
#
# Maruku is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# Maruku is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Maruku; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
# NOTE: this is the old span-level regexp-based parser.
#
# The new parser is a real parser, defined by the functions in parse_span_better.rb.
# It is faster and handles syntax errors, but it is far less readable.
#
# Also, regexp parsers simply CANNOT handle inline HTML properly.
# There are two black-magic methods `match_couple_of` and `map_match`,
# defined at the end of the file, that make the function
# `parse_lines_as_span` so elegant.
class Maruku
  # Takes care of all span-level formatting: inline code, immediate links,
  # inline HTML, footnote references, images, links, e-mail addresses,
  # HTML entities and emphasis.
  #
  # +lines+ is an Array of Strings; they must not contain block-level
  # elements.
  #
  # Returns the children (Strings and MDElements) of a temporary :dummy
  # element that every pass below rewrites in place.
  #
  # Side effect: links and images written with an immediate URL register a
  # "dummy_N" entry in @refs.
  def parse_lines_as_span(lines)
    # first, get rid of linebreaks
    res = resolve_linebreaks(lines)
    span = MDElement.new(:dummy, res)

    # encode all escapes
    span.replace_each_string { |s| s.escape_md_special }

    # The order of processing is significant:
    # 1. inline code
    # 2. immediate links
    # 3. inline HTML
    # 4. everything else

    # Search for ``code`` markers first, then for `single tick` markers.
    ['``', '`'].each do |ticks|
      span.match_couple_of(ticks) { |children, _open, _close|
        e = create_md_element(:inline_code)
        # the raw code is stored in the meta hash rather than as children
        e.meta[:raw_code] = children.join('').it_was_a_code_block
        e
      }
    end

    # Detect any immediate link: <http://www.google.com>
    # we expect an http: or something: at the beginning
    span.map_match(/<(\w+:[^\>]+)>/) { |match|
      e = create_md_element(:immediate_link, [])
      e.meta[:url] = match[1]
      e
    }

    # Search for inline HTML (the support is pretty basic for now).
    # This one searches for an opening tag with its matching closing tag.
    inline_html_pair = %r{
      (         # put everything in 1
        <       # open
        (\w+)   # opening tag name in 2
        >       # close
        .*      # anything
        </\2>   # match closing tag
      )
    }x
    # This one searches for a single self-closing tag.
    inline_html_single = %r{
      (         # put everything in 1
        <       # open
        \w+     # tag name
        [^<>]*  # anything except angle brackets
        />      # closing of the tag
      )
    }x
    [inline_html_pair, inline_html_single].each do |reg|
      span.map_match(reg) { |match| convert_raw_html_in_list(match[1]) }
    end

    # Detect footnote references: [^1]
    span.map_match(/\[(\^[^\]]+)\]/) { |match|
      e = create_md_element(:footnote_reference)
      e.meta[:footnote_id] = match[1].strip.downcase
      e
    }

    # Detect any image like ![Alt text][id]
    # NOTE(review): the alt text is only used as a fallback id and is not
    # stored on the element -- confirm whether the renderer needs it.
    span.map_match(/\!\[([^\]]+)\]\s?\[([^\]]*)\]/) { |match|
      alt = match[1]
      id = match[2].strip.downcase
      # ![Alt text][] -- an empty id means "use the alt text as the id"
      id = alt.strip.downcase if id.size == 0
      e = create_md_element(:image)
      e.meta[:ref_id] = id
      e
    }

    # Detect any image with immediate url: ![Alt](url "title")
    # a dummy ref is created and put in the symbol table
    image_with_url = /!\[([^\]]+)\]\s?\(([^\s\)]*)(?:\s+["'](.*)["'])?\)/
    span.map_match(image_with_url) { |match|
      url = match[2].strip
      title = match[3]
      # create a dummy id
      id = "dummy_#{@refs.size}"
      @refs[id] = {:url => url, :title => title}
      e = create_md_element(:image)
      e.meta[:ref_id] = id
      e
    }

    # an id reference: "[id]", "[ id ]"
    reg_id_ref = %r{
      \[         # opening bracket
      ([^\]]*)   # 0 or more non-closing bracket (this is too permissive)
      \]         # closing bracket
    }x

    # A url: any run of characters that are not whitespace, "]" or ")".
    # It contributes exactly one capture group to the regexps built from it.
    reg_url = %r{([^\s\]\)]+)}

    # [bah](http://www.google.com "Google.com"),
    # [bah](http://www.google.com),
    # [empty]()
    reg_url_and_title = %r{
      \(                     # opening
      \s*                    # whitespace
      #{reg_url}?            # url = 1 might be empty
      (?:\s+["'](.*)["'])?   # optional title = 2
      \s*                    # whitespace
      \)                     # closing
    }x

    # Detect a link like [Link text][id] that sits inside a single string
    span.map_match(/\[([^\]]+)\]\s?\[([^\]]*)\]/) { |match|
      text = match[1]
      id = match[2].strip.downcase
      # [Link text][] -- an empty id means "use the text as the id"
      id = text.strip.downcase if id.size == 0
      # parse_lines_as_span expects a list of lines, not a bare String
      children = parse_lines_as_span([text])
      e = create_md_element(:link, children)
      e.meta[:ref_id] = id
      e
    }

    # A pass that re-ran the immediate-url image pattern here to build :link
    # elements was dropped: the image pass above already consumes every
    # match of that pattern. Links with an immediate url are handled by the
    # second match_couple_of pass below.

    # Detect any link like [Google engine][google]
    span.match_couple_of('[',   # opening bracket
      %r{\]              # closing bracket
         [ ]?            # optional whitespace
         #{reg_id_ref}   # ref id; group 1 is the reference
      }x
    ) { |children, _open, match2|
      id = match2[1].strip.downcase
      id = children.join.strip.downcase if id.size == 0
      e = create_md_element(:link, children)
      e.meta[:ref_id] = id
      e
    }

    # Detect any link with immediate url: [Google](http://www.google.com)
    # XXX Note that the url can be empty: [Empty]()
    # a dummy ref is created and put in the symbol table
    span.match_couple_of('[',   # opening bracket
      %r{\]                     # closing bracket
         [ ]?                   # optional whitespace
         #{reg_url_and_title}   # group 1 is the url, group 2 the title
      }x
    ) { |children, _open, match2|
      url = match2[1]
      # reg_url has a single group, so the title is group 2 (there is no 3)
      title = match2[2]
      # create a dummy id
      id = "dummy_#{@refs.size}"
      @refs[id] = {:url => url}
      @refs[id][:title] = title if title
      e = create_md_element(:link, children)
      e.meta[:ref_id] = id
      e
    }

    # Detect an email address <andrea@invalid.it>
    span.map_match(EMailAddress) { |match|
      e = create_md_element(:email_address, [])
      e.meta[:email] = match[1]
      e
    }

    # Detect HTML entities
    span.map_match(/&([\w\d]+);/) { |match|
      e = create_md_element(:entity, [])
      e.meta[:entity_name] = match[1]
      e
    }

    # And now the easy stuff; longer markers must be tried before shorter ones.
    # search for ***strong and em*** and ___strong and em___
    ['***', '___'].each do |marker|
      span.match_couple_of(marker) { |children, _open, _close|
        create_md_element(:strong, [create_md_element(:emphasis, children)])
      }
    end
    # search for **strong** and __strong__
    ['**', '__'].each do |marker|
      span.match_couple_of(marker) { |children, _open, _close|
        create_md_element(:strong, children)
      }
    end
    # search for *emphasis* and _emphasis_
    ['*', '_'].each do |marker|
      span.match_couple_of(marker) { |children, _open, _close|
        create_md_element(:emphasis, children)
      }
    end

    # finally, unescape the special characters
    span.replace_each_string { |s| s.unescape_md_special }
    span.children
  end

  # Joins the stripped +lines+ with single spaces, cutting the text wherever
  # force_linebreak? reports a forced break.
  #
  # Returns an array containing Strings and :linebreak elements.
  def resolve_linebreaks(lines)
    res = []
    s = ""
    lines.each do |l|
      s += (s.size > 0 ? " " : "") + l.strip
      if force_linebreak?(l)
        # flush the text collected so far, then mark the break
        res << s
        res << create_md_element(:linebreak)
        s = ""
      end
    end
    res << s if s.size > 0
    res
  end

  # Wraps a fragment of inline HTML in a :raw_html element.
  #
  # raw_html is something like
  #   <em> A</em> dopwkk *maruk* <em>A</em>
  #
  # The raw text is always stored in meta[:raw_html]. meta[:parsed_html] is
  # set only when Document.new accepts the fragment; otherwise a message is
  # written to $stderr and the element is returned without it.
  def convert_raw_html_in_list(raw_html)
    e = create_md_element(:raw_html)
    e.meta[:raw_html] = raw_html
    begin
      e.meta[:parsed_html] = Document.new(raw_html)
    rescue
      $stderr.puts "convert_raw_html_in_list Malformed HTML:\n#{raw_html}"
    end
    e
  end
end
# And now the black magic that makes the part above so elegant
class MDElement
  # Try to match the regexp to each string in the hierarchy
  # (using `replace_each_string`). Where the regexp matches, the string is
  # replaced by the pre_match, the result of the block, and the post_match,
  # which is then searched again:
  #
  #   ..., matched_string, ... -> ..., pre_match, block.call(match), post_match
  #
  # The block receives the MatchData and may return a single value or an
  # array of values. Empty pre/post strings are dropped.
  #
  # A zero-length match stops the scan of the current string: it consumes no
  # input, so searching the post_match again would never terminate.
  def map_match(regexp, &block)
    replace_each_string { |s|
      processed = []
      while (match = regexp.match(s))
        break if match[0].empty?
        # save the pre_match
        processed << match.pre_match unless match.pre_match.empty?
        # transform the match and append whatever the block produced
        result = block.call(match)
        [*result].each { |e| processed << e }
        # go on with the rest of the string
        s = match.post_match
      end
      processed << s if s.size > 0
      processed
    }
  end

  # Finds couples of delimiters in a hierarchy of Strings and MDElements.
  #
  # +open+ and +close+ are two delimiters (like '[' and ']'), or two Regexps.
  # If you don't pass +close+, it defaults to +open+.
  #
  # MDElement children are processed first (recursively); then the direct
  # String children are scanned. For every opening match followed by a
  # closing match, the block is called with
  # |contained children, match1, match2| and its result replaces the
  # delimiters and everything between them. An opening delimiter without a
  # closing one is kept as plain text.
  #
  # Adjacent strings are merged into new String objects (the strings that
  # were passed in are never modified). Returns the new list of children.
  def match_couple_of(open, close=nil, &block)
    close ||= open
    open_regexp  = open.kind_of?(Regexp)  ? open  : Regexp.new(Regexp.escape(open))
    close_regexp = close.kind_of?(Regexp) ? close : Regexp.new(Regexp.escape(close))

    # Do the same to children first
    @children.each do |child|
      child.match_couple_of(open_regexp, close_regexp, &block) if child.kind_of?(MDElement)
    end

    processed_children = []
    until @children.empty?
      c = @children.shift
      match1 = c.kind_of?(String) ? open_regexp.match(c) : nil
      unless match1
        # not a string, or no opening delimiter in it
        processed_children << c
        next
      end

      # we found the opening: the pre match is processed ...
      processed_children << match1.pre_match unless match1.pre_match.empty?
      # ... and the post match goes back in the queue to be examined
      @children.unshift(match1.post_match) unless match1.post_match.empty?

      # now search for the closing, collecting everything in between
      contained = []
      found_closing = false
      until @children.empty? || found_closing
        candidate = @children.shift
        match2 = candidate.kind_of?(String) ? close_regexp.match(candidate) : nil
        if match2
          found_closing = true
          # the pre match is contained
          contained << match2.pre_match unless match2.pre_match.empty?
          # we will process again the post match
          @children.unshift(match2.post_match) unless match2.post_match.empty?
          # and now we call the block
          processed_children << block.call(contained, match1, match2)
        else
          contained << candidate
        end
      end

      unless found_closing
        # No closing delimiter: keep the opening text literally and put the
        # collected children back, in their original order, for re-scanning.
        processed_children << match1.to_s
        contained.reverse_each { |pending| @children.unshift(pending) }
      end
    end
    raise "BugBug" unless @children.empty?

    # rebuild strings: merge runs of adjacent strings without mutating them
    rebuilt = []
    processed_children.each do |piece|
      if piece.kind_of?(String) && rebuilt.last.kind_of?(String)
        rebuilt[-1] = rebuilt.last + piece
      else
        rebuilt << piece
      end
    end
    @children = rebuilt
  end
end