#   Copyright (C) 2006  Andrea Censi  <andrea (at) rubyforge.org>
#
# This file is part of Maruku.
#
#   Maruku is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   Maruku is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with Maruku; if not, write to the Free Software
#   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA


# NOTE: this is the old span-level regexp-based parser.
#
# The new parser is a real parser and is defined with functions in parse_span_better.rb
# The new parser is faster, handles syntax errors, but it's absolutely not readable.
#
# Also, regexp parsers simply CANNOT handle inline HTML properly.


# There are two black-magic methods `match_couple_of` and `map_match`,
# defined at the end of the file, that make the function
# `parse_lines_as_span` so elegant.

class Maruku

  # Inline HTML with a matching closing tag, e.g. <em>text</em>.
  # Group 1 is the whole fragment, group 2 the tag name.
  SPAN_INLINE_HTML_PAIR = %r{
    (        # put everything in 1
    <        # open
    (\w+)    # opening tag in 2
    >        # close
    .*       # anything
    </\2>    # match closing tag
    )
  }x

  # A single self-closed tag, e.g. <br/>. Group 1 is the whole tag.
  SPAN_INLINE_HTML_SINGLE = %r{
    (        # put everything in 1
    <        # open
    \w+      # tag name
    [^<>]*   # anything except angle brackets
    />       # closing tag
    )
  }x

  # Image with immediate url: ![Alt](url "title")
  # Group 1 = alt text, group 2 = url, group 3 = optional title.
  SPAN_IMMEDIATE_IMAGE = /!\[([^\]]+)\]\s?\(([^\s\)]*)(?:\s+["'](.*)["'])?\)/

  # An id reference: "[id]", "[ id ]". Group 1 = the id.
  SPAN_ID_REF = %r{
    \[         # opening bracket
    ([^\]]*)   # 0 or more non-closing bracket (this is too permissive)
    \]         # closing bracket
  }x

  # Anything that can stand for a url; group 1 is the url.
  SPAN_URL = %r{([^\s\]\)]+)}

  # The parenthesised part of an inline link:
  #   (http://www.google.com "Google.com"), (http://www.google.com), ()
  # Group 1 = url (may be missing), group 2 = optional title.
  SPAN_URL_AND_TITLE = %r{
    \(                     # opening
    \s*                    # whitespace
    #{SPAN_URL}?           # url = 1 might be empty
    (?:\s+["'](.*)["'])?   # optional title = 2
    \s*                    # whitespace
    \)                     # closing
  }x

  # Closing delimiter of a reference link: "] [id]". Group 1 = the id.
  SPAN_REF_LINK_CLOSE = %r{
    \]               # closing bracket
    [ ]?             # optional whitespace
    #{SPAN_ID_REF}   # ref id, group 1 being the reference
  }x

  # Closing delimiter of an inline link: "](url 'title')".
  # Group 1 = url, group 2 = title.
  SPAN_INLINE_LINK_CLOSE = %r{
    \]                      # closing bracket
    [ ]?                    # optional whitespace
    #{SPAN_URL_AND_TITLE}   # group 1 being the url and group 2 the title
  }x

  # Takes care of all span-level formatting, links, images, etc.
  #
  # Lines must not contain block-level elements.
  #
  # +lines+ is an Array of Strings. The lines are first merged by
  # `resolve_linebreaks`, wrapped in a temporary :dummy MDElement, and then
  # rewritten in place by a sequence of `map_match` / `match_couple_of`
  # passes. Immediate-url images and links register a "dummy_N" entry
  # in @refs.
  #
  # Returns the children of the temporary element.
  def parse_lines_as_span(lines)
    # first, get rid of linebreaks
    span = MDElement.new(:dummy, resolve_linebreaks(lines))

    # encode all escapes
    span.replace_each_string { |s| s.escape_md_special }

    # The order of processing is significant:
    # 1. inline code
    # 2. immediate links
    # 3. inline HTML
    # 4. everything else

    # search for ``code`` markers first, then `single tick` code markers
    ['``', '`'].each do |ticks|
      span.match_couple_of(ticks) do |children, _open, _close|
        e = create_md_element(:inline_code)
        # this is now opaque to processing
        e.meta[:raw_code] = children.join('').it_was_a_code_block
        e
      end
    end

    # Detect any immediate link: <http://www.google.com>
    # we expect an http: or something: at the beginning
    span.map_match(/<(\w+:[^\>]+)>/) do |match|
      e = create_md_element(:immediate_link, [])
      e.meta[:url] = match[1]
      e
    end

    # Search for inline HTML (the support is pretty basic for now):
    # first a matching open/close pair, then a single self-closed tag.
    [SPAN_INLINE_HTML_PAIR, SPAN_INLINE_HTML_SINGLE].each do |reg|
      span.map_match(reg) { |match| convert_raw_html_in_list(match[1]) }
    end

    # Detect footnotes references: [^1]
    span.map_match(/\[(\^[^\]]+)\]/) do |match|
      e = create_md_element(:footnote_reference)
      e.meta[:footnote_id] = match[1].strip.downcase
      e
    end

    # Detect any image like ![Alt text][url]
    span.map_match(/\!\[([^\]]+)\]\s?\[([^\]]*)\]/) do |match|
      alt = match[1]
      id = match[2].strip.downcase
      # an empty id (![Alt text][]) means the alt text itself is the id
      id = alt.strip.downcase if id.empty?

      e = create_md_element(:image)
      e.meta[:ref_id] = id
      e
    end

    # Detect any image with immediate url: ![Alt](url "title")
    # a dummy ref is created and put in the symbol table
    # NOTE(review): the alt text (group 1) is not stored anywhere here.
    span.map_match(SPAN_IMMEDIATE_IMAGE) do |match|
      url = match[2].strip
      title = match[3]

      # create a dummy id
      id = "dummy_#{@refs.size}"
      @refs[id] = {:url => url, :title => title}

      e = create_md_element(:image)
      e.meta[:ref_id] = id
      e
    end

    # Detect a link like [Text][id] whose text contains no closing bracket
    span.map_match(/\[([^\]]+)\]\s?\[([^\]]*)\]/) do |match|
      text = match[1]
      id = match[2].strip.downcase
      id = text.strip.downcase if id.empty?

      # parse_lines_as_span expects an Array of lines, not a bare String
      children = parse_lines_as_span([text])
      e = create_md_element(:link, children)
      e.meta[:ref_id] = id
      e
    end

    # Detect any link like [Google engine][google]
    span.match_couple_of('[', SPAN_REF_LINK_CLOSE) do |children, _open, close|
      id = close[1].strip.downcase
      id = children.join.strip.downcase if id.empty?

      e = create_md_element(:link, children)
      e.meta[:ref_id] = id
      e
    end

    # Detect any link with immediate url: [Google](http://www.google.com)
    # XXX Note that the url can be empty: [Empty]()
    # a dummy ref is created and put in the symbol table
    span.match_couple_of('[', SPAN_INLINE_LINK_CLOSE) do |children, _open, close|
      url = close[1]
      # the title is the second capture of SPAN_INLINE_LINK_CLOSE
      title = close[2]

      # create a dummy id
      id = "dummy_#{@refs.size}"
      @refs[id] = {:url => url}
      @refs[id][:title] = title if title

      e = create_md_element(:link, children)
      e.meta[:ref_id] = id
      e
    end

    # Detect an email address <andrea@invalid.it>
    span.map_match(EMailAddress) do |match|
      e = create_md_element(:email_address, [])
      e.meta[:email] = match[1]
      e
    end

    # Detect HTML entities
    span.map_match(/&([\w\d]+);/) do |match|
      e = create_md_element(:entity, [])
      e.meta[:entity_name] = match[1]
      e
    end

    # And now the easy stuff. Longer markers must be tried before
    # shorter ones, so that *** is not consumed as ** or *.

    # search for ***strong and em*** and ___strong and em___
    ['***', '___'].each do |marker|
      span.match_couple_of(marker) do |children, _open, _close|
        create_md_element(:strong, [create_md_element(:emphasis, children)])
      end
    end

    # search for **strong** and __strong__
    ['**', '__'].each do |marker|
      span.match_couple_of(marker) do |children, _open, _close|
        create_md_element(:strong, children)
      end
    end

    # search for *emphasis* and _emphasis_
    ['*', '_'].each do |marker|
      span.match_couple_of(marker) do |children, _open, _close|
        create_md_element(:emphasis, children)
      end
    end

    # finally, unescape the special characters
    span.replace_each_string { |s| s.unescape_md_special }

    span.children
  end

  # Merges consecutive lines into single strings (stripped and joined by
  # one space), cutting wherever `force_linebreak?` is true for a line.
  #
  # Returns an array containing Strings or :linebreak elements.
  def resolve_linebreaks(lines)
    res = []
    pending = ""
    lines.each do |line|
      pending += " " unless pending.empty?
      pending += line.strip
      if force_linebreak?(line)
        res << pending
        res << create_md_element(:linebreak)
        pending = ""
      end
    end
    res << pending unless pending.empty?
    res
  end

  # Wraps a fragment of inline HTML in a :raw_html element.
  #
  # raw_html is something like
  #   <em> A</em> dopwkk *maruk* <em>A</em>
  #
  # The raw text is stored in meta[:raw_html]; the result of
  # Document.new(raw_html) is stored in meta[:parsed_html]. If that raises,
  # a message is printed on $stderr and meta[:parsed_html] is left unset
  # (best effort: the element is returned anyway).
  def convert_raw_html_in_list(raw_html)
    e = create_md_element(:raw_html)
    e.meta[:raw_html] = raw_html
    begin
      e.meta[:parsed_html] = Document.new(raw_html)
    rescue
      $stderr.puts "convert_raw_html_in_list Malformed HTML:\n#{raw_html}"
    end
    e
  end

end

# And now the black magic that makes the part above so elegant

class MDElement

  # Try to match the regexp to each string in the hierarchy
  # (using `replace_each_string`). If the regexp matches, eliminate
  # the matching string and substitute it with the pre_match, the
  # result of the block, and the post_match
  #
  #   ..., matched_string, ... -> ..., pre_match, block.call(match), post_match
  #
  # The block receives the MatchData and might return arrays.
  # Matching is repeated on the post_match until the regexp no longer
  # matches; empty pre/post strings are dropped.
  def map_match(regexp, &block)
    replace_each_string do |s|
      processed = []
      rest = s
      while (match = regexp.match(rest))
        # A zero-length match consumes nothing: post_match would equal
        # `rest` and the loop would never terminate.
        break if match[0].empty?

        # save the pre_match
        processed << match.pre_match unless match.pre_match.empty?
        # transform the match and append the result(s) as processed
        processed.concat([*block.call(match)])
        # go on with the rest of the string
        rest = match.post_match
      end
      processed << rest unless rest.empty?
      processed
    end
  end

  # Finds couples of delimiters in a hierarchy of Strings and MDElements.
  #
  # Open and close are two delimiters (like '[' and ']'), or two Regexp.
  # Plain strings are matched literally.
  #
  # If you don't pass close, it defaults to open.
  #
  # The block is called with |contained children, match1, match2|, where
  # match1 / match2 are the MatchData of the opening / closing delimiter;
  # its result replaces the delimiters and everything between them.
  # An opening delimiter with no closing one is put back as plain text.
  # Child MDElements are processed recursively first. Adjacent strings in
  # the result are merged, and @children is replaced by the rebuilt list.
  def match_couple_of(open, close=nil, &block)
    close ||= open
    open_regexp  = open.kind_of?(Regexp)  ? open  : Regexp.new(Regexp.escape(open))
    close_regexp = close.kind_of?(Regexp) ? close : Regexp.new(Regexp.escape(close))

    # Do the same to children first
    @children.each do |child|
      child.match_couple_of(open_regexp, close_regexp, &block) if child.kind_of?(MDElement)
    end

    processed_children = []

    # @children is used as a work queue: pieces still to be scanned are
    # pushed back at its front.
    until @children.empty?
      c = @children.shift
      match1 = c.kind_of?(String) ? open_regexp.match(c) : nil

      unless match1
        processed_children << c
        next
      end

      # we found opening, now search closing
      # pre match is processed
      processed_children << match1.pre_match unless match1.pre_match.empty?
      # we will process again the post_match
      @children.unshift(match1.post_match) unless match1.post_match.empty?

      contained = []
      match2 = nil
      until @children.empty? || match2
        inner = @children.shift
        match2 = close_regexp.match(inner) if inner.kind_of?(String)
        if match2
          # we found closing: pre match is contained
          contained << match2.pre_match unless match2.pre_match.empty?
          # we will process again the post_match
          @children.unshift(match2.post_match) unless match2.post_match.empty?
        else
          contained << inner
        end
      end

      if match2
        # And now we call the block
        processed_children << block.call(contained, match1, match2)
      else
        # Could not find closing: keep the opening delimiter as text and
        # put what we collected back in the queue, in the original order.
        processed_children << match1.to_s
        contained.reverse_each { |item| @children.unshift(item) }
      end
    end

    raise "BugBug" unless @children.empty?

    # rebuild strings: merge adjacent strings into one, without mutating
    # any of them (they may be shared with the caller or frozen)
    rebuilt = []
    processed_children.each do |piece|
      if piece.kind_of?(String) && rebuilt.last.kind_of?(String)
        rebuilt[-1] = rebuilt.last + piece
      else
        rebuilt << piece
      end
    end
    @children = rebuilt
  end

end
|