Bring up to current.
This commit is contained in:
parent
69b62b6f33
commit
b19e1e4f47
71 changed files with 8305 additions and 39 deletions
462
lib/maruku/attic/parse_span.rb.txt
Normal file
462
lib/maruku/attic/parse_span.rb.txt
Normal file
|
@ -0,0 +1,462 @@
|
|||
# Copyright (C) 2006 Andrea Censi <andrea (at) rubyforge.org>
|
||||
#
|
||||
# This file is part of Maruku.
|
||||
#
|
||||
# Maruku is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# Maruku is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with Maruku; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
|
||||
|
||||
|
||||
# NOTE: this is the old span-level regexp-based parser.
|
||||
#
|
||||
# The new parser is a real parser and is defined with functions in parse_span_better.rb
|
||||
# The new parser is faster, handles syntax errors, but it's absolutely not readable.
|
||||
#
|
||||
# Also, regexp parsers simply CANNOT handle inline HTML properly.
|
||||
|
||||
|
||||
|
||||
# There are two black-magic methods `match_couple_of` and `map_match`,
|
||||
# defined at the end of the file, that make the function
|
||||
# `parse_lines_as_span` so elegant.
|
||||
|
||||
class Maruku
|
||||
|
||||
# Takes care of all span-level formatting, links, images, etc.
|
||||
#
|
||||
# Lines must not contain block-level elements.
|
||||
# Parses span-level Markdown by repeatedly rewriting the children of a
# temporary MDElement: each pass replaces the text matched by one construct
# with an element built through create_md_element.
#
# lines - passed straight to resolve_linebreaks.
#
# Side effect: inline images/links register a "dummy_N" entry in @refs.
#
# Returns the children (Strings and elements) of the temporary span.
def parse_lines_as_span(lines)
  # first, get rid of linebreaks
  res = resolve_linebreaks(lines)

  span = MDElement.new(:dummy, res)

  # encode all escapes
  span.replace_each_string { |s| s.escape_md_special }

  # The order of processing is significant:
  # 1. inline code
  # 2. immediate links
  # 3. inline HTML
  # 4. everything else

  # search for ``code`` markers first, then `single tick` code markers
  ['``', '`'].each do |ticks|
    span.match_couple_of(ticks) { |children, match1, match2|
      e = create_md_element(:inline_code)
      # this is now opaque to processing
      e.meta[:raw_code] = children.join('').it_was_a_code_block
      e
    }
  end

  # Detect any immediate link: <http://www.google.com>
  # we expect an http: or something: at the beginning
  span.map_match( /<(\w+:[^\>]+)>/) { |match|
    url = match[1]

    e = create_md_element(:immediate_link, [])
    e.meta[:url] = url
    e
  }

  # Search for inline HTML (the support is pretty basic for now)

  # this searches for a matching block
  inlineHTML1 = %r{
    (        # put everything in 1
    <        # open
    (\w+)    # opening tag in 2
    >        # close
    .*       # anything
    </\2>    # match closing tag
    )
  }x

  # this searches for only one block
  inlineHTML2 = %r{
    (        # put everything in 1
    <        # open
    \w+      #
             # close
    [^<>]*   # anything except
    />       # closing tag
    )
  }x

  [inlineHTML1, inlineHTML2].each do |reg|
    span.map_match(reg) { |match|
      raw_html = match[1]
      convert_raw_html_in_list(raw_html)
    }
  end

  # Detect footnotes references: [^1]
  span.map_match(/\[(\^[^\]]+)\]/) { |match|
    id = match[1].strip.downcase
    e = create_md_element(:footnote_reference)
    e.meta[:footnote_id] = id
    e
  }

  # Detect any image like ![Alt text][url]
  span.map_match(/\!\[([^\]]+)\]\s?\[([^\]]*)\]/) { |match|
    alt = match[1]
    id = match[2].strip.downcase

    # "![Alt text][]": an empty id falls back to the alt text.
    # (This used to read an undefined local `text`.)
    if id.size == 0
      id = alt.strip.downcase
    end

    e = create_md_element(:image)
    e.meta[:ref_id] = id
    e
  }

  # Detect any image with immediate url: ![Alt text](url "title")
  # a dummy ref is created and put in the symbol table
  link1 = /!\[([^\]]+)\]\s?\(([^\s\)]*)(?:\s+["'](.*)["'])?\)/
  span.map_match(link1) { |match|
    # match[1] (the alt text) is currently not stored anywhere
    url = match[2]
    title = match[3]

    url = url.strip
    # create a dummy id
    id="dummy_#{@refs.size}"
    @refs[id] = {:url=>url, :title=>title}

    e = create_md_element(:image)
    e.meta[:ref_id] = id
    e
  }

  # an id reference: "[id]", "[ id ]"
  reg_id_ref = %r{
    \[         # opening bracket
    ([^\]]*)   # 0 or more non-closing bracket (this is too permissive)
    \]         # closing bracket
  }x

  # A url: anything up to whitespace, "]" or ")"; only $1 is set to the url.
  # (A stricter scheme://host regexp used to be assigned here and was
  # immediately overridden by this permissive one.)
  reg_url = %r{([^\s\]\)]+)}

  # [bah](http://www.google.com "Google.com"),
  # [bah](http://www.google.com),
  # [empty]()
  reg_url_and_title = %r{
    \(                     # opening
    \s*                    # whitespace
    #{reg_url}?            # url = 1 might be empty
    (?:\s+["'](.*)["'])?   # optional title = 2
    \s*                    # whitespace
    \)                     # closing
  }x

  # Detect a link like [Text][id]
  span.map_match(/\[([^\]]+)\]\s?\[([^\]]*)\]/) { |match|
    text = match[1]
    id = match[2].strip.downcase

    if id.size == 0
      id = text.strip.downcase
    end

    children = parse_lines_as_span(text)
    e = create_md_element(:link, children)
    e.meta[:ref_id] = id
    e
  }

  # NOTE(review): this pass reuses the *image* regexp (leading "!"), and
  # every such match was already consumed by the image pass above, so it
  # looks unreachable. Kept as-is; confirm before removing or changing it.
  # a dummy ref is created and put in the symbol table
  link1 = /!\[([^\]]+)\]\s?\(([^\s\)]*)(?:\s+["'](.*)["'])?\)/
  span.map_match(link1) { |match|
    text = match[1]
    children = parse_lines_as_span(text)

    url = match[2]
    title = match[3]

    url = url.strip
    # create a dummy id
    id="dummy_#{@refs.size}"
    @refs[id] = {:url=>url, :title=>title}

    e = create_md_element(:link, children)
    e.meta[:ref_id] = id
    e
  }

  # Detect any link like [Google engine][google]
  span.match_couple_of('[',   # opening bracket
    %r{\]                     # closing bracket
       [ ]?                   # optional whitespace
       #{reg_id_ref}          # ref id, with $1 being the reference
    }x
  ) { |children, match1, match2|
    id = match2[1]
    id = id.strip.downcase

    if id.size == 0
      id = children.join.strip.downcase
    end

    e = create_md_element(:link, children)
    e.meta[:ref_id] = id
    e
  }

  # Detect any link with immediate url: [Google](http://www.google.com)
  # XXX Note that the url can be empty: [Empty]()
  # a dummy ref is created and put in the symbol table
  span.match_couple_of('[',   # opening bracket
    %r{\]                     # closing bracket
       [ ]?                   # optional whitespace
       #{reg_url_and_title}   # with $1 being the url and $2 being the title
    }x
  ) { |children, match1, match2|

    url = match2[1]
    # The combined regexp has exactly two groups (url, title); the previous
    # match2[3] was always nil, so link titles were silently dropped.
    title = match2[2]

    # create a dummy id
    id="dummy_#{@refs.size}"
    @refs[id] = {:url=>url}
    @refs[id][:title] = title if title

    e = create_md_element(:link, children)
    e.meta[:ref_id] = id
    e
  }

  # Detect an email address <andrea@invalid.it>
  span.map_match(EMailAddress) { |match|
    email = match[1]
    e = create_md_element(:email_address, [])
    e.meta[:email] = email
    e
  }

  # Detect HTML entities
  span.map_match(/&([\w\d]+);/) { |match|
    entity_name = match[1]

    e = create_md_element(:entity, [])
    e.meta[:entity_name] = entity_name
    e
  }

  # And now the easy stuff

  # search for ***strong and em***
  span.match_couple_of('***') { |children,m1,m2|
    create_md_element(:strong, [create_md_element(:emphasis, children)] ) }

  span.match_couple_of('___') { |children,m1,m2|
    create_md_element(:strong, [create_md_element(:emphasis, children)] ) }

  # search for **strong**
  span.match_couple_of('**') { |children,m1,m2| create_md_element(:strong, children) }

  # search for __strong__
  span.match_couple_of('__') { |children,m1,m2| create_md_element(:strong, children) }

  # search for *emphasis*
  span.match_couple_of('*') { |children,m1,m2| create_md_element(:emphasis, children) }

  # search for _emphasis_
  span.match_couple_of('_') { |children,m1,m2| create_md_element(:emphasis, children) }

  # finally, unescape the special characters
  span.replace_each_string { |s| s.unescape_md_special}

  span.children
end
|
||||
|
||||
# returns array containing Strings or :linebreak elements
|
||||
# Joins the stripped lines with single spaces, cutting the running text
# whenever force_linebreak? says a line forces a break.
#
# Returns an array alternating accumulated Strings and :linebreak elements
# (built with create_md_element); trailing text is appended only if non-empty.
def resolve_linebreaks(lines)
  pieces = []
  buffer = ""
  lines.each do |line|
    separator = buffer.size > 0 ? " " : ""
    buffer = buffer + separator + line.strip
    next unless force_linebreak?(line)

    # Flush what we have (even if empty) followed by the break marker.
    pieces.push(buffer)
    pieces.push(create_md_element(:linebreak))
    buffer = ""
  end
  pieces.push(buffer) unless buffer.size == 0
  pieces
end
|
||||
|
||||
# raw_html is something like
|
||||
# <em> A</em> dopwkk *maruk* <em>A</em>
|
||||
# Wraps a fragment of inline HTML in a :raw_html element.
#
# The raw text is always stored in meta[:raw_html]. meta[:parsed_html] is
# set only when Document.new accepts the fragment; on failure a message is
# written to $stderr and the element is returned without it.
def convert_raw_html_in_list(raw_html)
  create_md_element(:raw_html).tap do |element|
    element.meta[:raw_html] = raw_html
    begin
      element.meta[:parsed_html] = Document.new(raw_html)
    rescue
      $stderr.puts "convert_raw_html_in_list Malformed HTML:\n#{raw_html}"
    end
  end
end
|
||||
|
||||
end
|
||||
|
||||
# And now the black magic that makes the part above so elegant
|
||||
class MDElement
|
||||
|
||||
# Try to match the regexp to each string in the hierarchy
|
||||
# (using `replace_each_string`). If the regexp match, eliminate
|
||||
# the matching string and substitute it with the pre_match, the
|
||||
# result of the block, and the post_match
|
||||
#
|
||||
# ..., matched_string, ... -> ..., pre_match, block.call(match), post_match
|
||||
#
|
||||
# the block might return arrays.
|
||||
#
|
||||
# For every string handed over by replace_each_string, splits it around
# each match of +regexp+ and yields the MatchData to the block.
#
# The replacement list for a string is: text before the match (if any),
# the block's result (splatted, so it may be one item or an array), and so
# on through the rest of the string, ending with any non-empty remainder.
# NOTE(review): presumably replace_each_string substitutes the returned
# array for the original string -- confirm against its definition.
def map_match(regexp, &block)
  replace_each_string do |remaining|
    pieces = []
    while (found = regexp.match(remaining))
      # keep the text that precedes the match
      before = found.pre_match
      pieces << before if before && before.size > 0
      # substitute the match with whatever the block produced
      pieces.concat([*block.call(found)])
      # continue scanning after the match
      remaining = found.post_match
    end
    pieces << remaining if remaining.size > 0
    pieces
  end
end
|
||||
|
||||
# Finds couple of delimiters in a hierarchy of Strings and MDElements
|
||||
#
|
||||
# Open and close are two delimiters (like '[' and ']'), or two Regexp.
|
||||
#
|
||||
# If you don't pass close, it defaults to open.
|
||||
#
|
||||
# Each block is called with |contained children, match1, match2|
|
||||
# Rewrites @children, replacing every open ... close delimited run with the
# value returned by the block.
#
# open  - String (matched literally) or Regexp for the opening delimiter.
# close - String or Regexp for the closing delimiter; defaults to +open+.
# block - called as block.call(contained, open_match, close_match), where
#         +contained+ is the array of children found between the delimiters.
#
# MDElement children are processed recursively first. An opening delimiter
# with no closing one is put back as plain text. Adjacent strings are merged
# into new String objects, so the strings passed in are never mutated
# (the previous in-place `<<` failed on frozen strings).
#
# Returns the rebuilt @children array.
def match_couple_of(open, close=nil, &block)
  close ||= open
  open_regexp  = open.kind_of?(Regexp)  ? open  : Regexp.new(Regexp.escape(open))
  close_regexp = close.kind_of?(Regexp) ? close : Regexp.new(Regexp.escape(close))

  # Do the same to children first
  @children.each do |child|
    child.match_couple_of(open_regexp, close_regexp, &block) if child.kind_of?(MDElement)
  end

  processed_children = []

  until @children.empty?
    c = @children.shift
    match1 = c.kind_of?(String) ? open_regexp.match(c) : nil
    unless match1
      processed_children << c
      next
    end

    # we found opening, now search closing
    # pre match is processed
    processed_children << match1.pre_match unless match1.pre_match.empty?
    # we will process again the post_match
    @children.unshift(match1.post_match) unless match1.post_match.empty?

    contained = []
    found_closing = false
    until @children.empty? || found_closing
      c = @children.shift
      match2 = c.kind_of?(String) ? close_regexp.match(c) : nil
      unless match2
        contained << c
        next
      end

      # we found closing
      found_closing = true
      # pre match is contained
      contained << match2.pre_match unless match2.pre_match.empty?
      # we will process again the post_match
      @children.unshift(match2.post_match) unless match2.post_match.empty?

      # And now we call the block
      processed_children << block.call(contained, match1, match2)
    end

    next if found_closing

    # No closing delimiter: keep the opening text literally and put the
    # swallowed children back (in order) so they are scanned again.
    processed_children << match1.to_s
    @children.unshift(*contained)
  end

  raise "BugBug" unless @children.empty?

  # rebuild strings: merge neighbours without touching the originals
  rebuilt = []
  processed_children.each do |piece|
    if piece.kind_of?(String) && rebuilt.last.kind_of?(String)
      rebuilt[-1] = rebuilt.last + piece
    else
      rebuilt << piece
    end
  end
  @children = rebuilt
end
|
||||
end
|
Loading…
Add table
Add a link
Reference in a new issue