# Copyright (C) 2006 Andrea Censi # # This file is part of Maruku. # # Maruku is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # Maruku is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Maruku; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA # NOTE: this is the old span-level regexp-based parser. # # The new parser is a real parser and is defined with functions in parse_span_better.rb # The new parser is faster, handles syntax errors, but it's absolutely not readable. # # Also, regexp parsers simply CANNOT handle inline HTML properly. # There are two black-magic methods `match_couple_of` and `map_match`, # defined at the end of the file, that make the function # `parse_lines_as_span` so elegant. class Maruku # Takes care of all span-level formatting, links, images, etc. # # Lines must not contain block-level elements. def parse_lines_as_span(lines) # first, get rid of linebreaks res = resolve_linebreaks(lines) span = MDElement.new(:dummy, res) # encode all escapes span.replace_each_string { |s| s.escape_md_special } # The order of processing is significant: # 1. inline code # 2. immediate links # 3. inline HTML # 4. everything else # search for ``code`` markers span.match_couple_of('``') { |children, match1, match2| e = create_md_element(:inline_code) # this is now opaque to processing e.meta[:raw_code] = children.join('').it_was_a_code_block e } # Search for `single tick` code markers span.match_couple_of('`') { |children, match1, match2| e = create_md_element(:inline_code) # this is now opaque to processing e.meta[:raw_code] = children.join('').it_was_a_code_block # this is now opaque to processing e } # Detect any immediate link: # we expect an http: or something: at the beginning span.map_match( /<(\w+:[^\>]+)>/) { |match| url = match[1] e = create_md_element(:immediate_link, []) e.meta[:url] = url e } # Search for inline HTML (the support is pretty basic for now) # this searches for a matching block inlineHTML1 = %r{ ( # put everything in 1 < # open (\w+) # opening tag in 2 > # close .* # anything # match closing tag ) }x # this searches for only one block inlineHTML2 = %r{ ( # put everything in 1 < # open \w+ # # close [^<>]* # anything except /> # closing tag ) }x for reg in [inlineHTML1, inlineHTML2] span.map_match(reg) { |match| raw_html = match[1] convert_raw_html_in_list(raw_html) } end # Detect footnotes references: [^1] span.map_match(/\[(\^[^\]]+)\]/) { |match| id = match[1].strip.downcase e = create_md_element(:footnote_reference) e.meta[:footnote_id] = id e } # Detect any image like ![Alt text][url] span.map_match(/\!\[([^\]]+)\]\s?\[([^\]]*)\]/) { |match| alt = match[1] id = match[2].strip.downcase if id.size == 0 id = text.strip.downcase end e = create_md_element(:image) e.meta[:ref_id] = id e } # Detect any immage with immediate url: ![Alt](url "title") # a dummy ref is created and put in the symbol table link1 = /!\[([^\]]+)\]\s?\(([^\s\)]*)(?:\s+["'](.*)["'])?\)/ span.map_match(link1) { |match| alt = match[1] url = match[2] title = match[3] url = url.strip # create a dummy id id="dummy_#{@refs.size}" @refs[id] = {:url=>url, :title=>title} e = create_md_element(:image) e.meta[:ref_id] = id e } # an id reference: "[id]", "[ id ]" reg_id_ref = %r{ \[ # opening bracket ([^\]]*) # 0 or more non-closing bracket (this is too permissive) \] # closing bracket }x # validates a url, only $1 is set to the url reg_url = /((?:\w+):\/\/(?:\w+:{0,1}\w*@)?(?:\S+)(?::[0-9]+)?(?:\/|\/([\w#!:.?+=&%@!\-\/]))?)/ reg_url = %r{([^\s\]\)]+)} # A string enclosed in quotes. reg_title = %r{ " # opening [^"]* # anything = 1 " # closing }x # [bah](http://www.google.com "Google.com"), # [bah](http://www.google.com), # [empty]() reg_url_and_title = %r{ \( # opening \s* # whitespace #{reg_url}? # url = 1 might be empty (?:\s+["'](.*)["'])? # optional title = 2 \s* # whitespace \) # closing }x # Detect a link like ![Alt text][id] span.map_match(/\[([^\]]+)\]\s?\[([^\]]*)\]/) { |match| text = match[1] id = match[2].strip.downcase if id.size == 0 id = text.strip.downcase end children = parse_lines_as_span(text) e = create_md_element(:link, children) e.meta[:ref_id] = id e } # Detect any immage with immediate url: ![Alt](url "title") # a dummy ref is created and put in the symbol table link1 = /!\[([^\]]+)\]\s?\(([^\s\)]*)(?:\s+["'](.*)["'])?\)/ span.map_match(link1) { |match| text = match[1] children = parse_lines_as_span(text) url = match[2] title = match[3] url = url.strip # create a dummy id id="dummy_#{@refs.size}" @refs[id] = {:url=>url, :title=>title} @refs[id][:title] = title if title e = create_md_element(:link, children) e.meta[:ref_id] = id e } # Detect any link like [Google engine][google] span.match_couple_of('[', # opening bracket %r{\] # closing bracket [ ]? # optional whitespace #{reg_id_ref} # ref id, with $1 being the reference }x ) { |children, match1, match2| id = match2[1] id = id.strip.downcase if id.size == 0 id = children.join.strip.downcase end e = create_md_element(:link, children) e.meta[:ref_id] = id e } # Detect any link with immediate url: [Google](http://www.google.com) # XXX Note that the url can be empty: [Empty]() # a dummy ref is created and put in the symbol table span.match_couple_of('[', # opening bracket %r{\] # closing bracket [ ]? # optional whitespace #{reg_url_and_title} # ref id, with $1 being the url and $2 being the title }x ) { |children, match1, match2| url = match2[1] title = match2[3] # XXX? Is it a bug? I would use [2] # create a dummy id id="dummy_#{@refs.size}" @refs[id] = {:url=>url} @refs[id][:title] = title if title e = create_md_element(:link, children) e.meta[:ref_id] = id e } # Detect an email address span.map_match(EMailAddress) { |match| email = match[1] e = create_md_element(:email_address, []) e.meta[:email] = email e } # Detect HTML entitis span.map_match(/&([\w\d]+);/) { |match| entity_name = match[1] e = create_md_element(:entity, []) e.meta[:entity_name] = entity_name e } # And now the easy stuff # search for ***strong and em*** span.match_couple_of('***') { |children,m1,m2| create_md_element(:strong, [create_md_element(:emphasis, children)] ) } span.match_couple_of('___') { |children,m1,m2| create_md_element(:strong, [create_md_element(:emphasis, children)] ) } # search for **strong** span.match_couple_of('**') { |children,m1,m2| create_md_element(:strong, children) } # search for __strong__ span.match_couple_of('__') { |children,m1,m2| create_md_element(:strong, children) } # search for *emphasis* span.match_couple_of('*') { |children,m1,m2| create_md_element(:emphasis, children) } # search for _emphasis_ span.match_couple_of('_') { |children,m1,m2| create_md_element(:emphasis, children) } # finally, unescape the special characters span.replace_each_string { |s| s.unescape_md_special} span.children end # returns array containing Strings or :linebreak elements def resolve_linebreaks(lines) res = [] s = "" lines.each do |l| s += (s.size>0 ? " " : "") + l.strip if force_linebreak?(l) res << s res << create_md_element(:linebreak) s = "" end end res << s if s.size > 0 res end # raw_html is something like # A dopwkk *maruk* A def convert_raw_html_in_list(raw_html) e = create_md_element(:raw_html) e.meta[:raw_html] = raw_html begin e.meta[:parsed_html] = Document.new(raw_html) rescue $stderr.puts "convert_raw_html_in_list Malformed HTML:\n#{raw_html}" end e end end # And now the black magic that makes the part above so elegant class MDElement # Try to match the regexp to each string in the hierarchy # (using `replace_each_string`). If the regexp match, eliminate # the matching string and substitute it with the pre_match, the # result of the block, and the post_match # # ..., matched_string, ... -> ..., pre_match, block.call(match), post_match # # the block might return arrays. # def map_match(regexp, &block) replace_each_string { |s| processed = [] while (match = regexp.match(s)) # save the pre_match processed << match.pre_match if match.pre_match && match.pre_match.size>0 # transform match result = block.call(match) # and append as processed [*result].each do |e| processed << e end # go on with the rest of the string s = match.post_match end processed << s if s.size > 0 processed } end # Finds couple of delimiters in a hierarchy of Strings and MDElements # # Open and close are two delimiters (like '[' and ']'), or two Regexp. # # If you don't pass close, it defaults to open. # # Each block is called with |contained children, match1, match2| def match_couple_of(open, close=nil, &block) close = close || open open_regexp = open.kind_of?(Regexp) ? open : Regexp.new(Regexp.escape(open)) close_regexp = close.kind_of?(Regexp) ? close : Regexp.new(Regexp.escape(close)) # Do the same to children first for c in @children; if c.kind_of? MDElement c.match_couple_of(open_regexp, close_regexp, &block) end end processed_children = [] until @children.empty? c = @children.shift if c.kind_of? String match1 = open_regexp.match(c) if not match1 processed_children << c else # we found opening, now search closing # puts "Found opening (#{marker}) in #{c.inspect}" # pre match is processed processed_children.push match1.pre_match if match1.pre_match && match1.pre_match.size > 0 # we will process again the post_match @children.unshift match1.post_match if match1.post_match && match1.post_match.size>0 contained = []; found_closing = false until @children.empty? || found_closing c = @children.shift if c.kind_of? String match2 = close_regexp.match(c) if not match2 contained << c else # we found closing found_closing = true # pre match is contained contained.push match2.pre_match if match2.pre_match && match2.pre_match.size>0 # we will process again the post_match @children.unshift match2.post_match if match2.post_match && match2.post_match.size>0 # And now we call the block substitute = block.call(contained, match1, match2) processed_children << substitute # puts "Found closing (#{marker}) in #{c.inspect}" # puts "Children: #{contained.inspect}" # puts "Substitute: #{substitute.inspect}" end else contained << c end end if not found_closing # $stderr.puts "##### Could not find closing for #{open}, #{close} -- ignoring" processed_children << match1.to_s contained.reverse.each do |c| @children.unshift c end end end else processed_children << c end end raise "BugBug" unless @children.empty? rebuilt = [] # rebuild strings processed_children.each do |c| if c.kind_of?(String) && rebuilt.last && rebuilt.last.kind_of?(String) rebuilt.last << c else rebuilt << c end end @children = rebuilt end end