instiki/lib/maruku/input/parse_span_better.rb

705 lines
18 KiB
Ruby
Raw Normal View History

2007-01-22 15:36:51 +01:00
#--
# Copyright (C) 2006 Andrea Censi <andrea (at) rubyforge.org>
#
# This file is part of Maruku.
#
# Maruku is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# Maruku is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Maruku; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#++
require 'set'
module MaRuKu; module In; module Markdown; module SpanLevelParser
# Mix the shared Maruku helpers into the span-level parser.
include MaRuKu::Helpers
# Characters that may be backslash-escaped in ordinary text; this set is
# what the parser passes as +escaped+ when it reads a span.
EscapedCharInText =
Set.new [?\\,?`,?*,?_,?{,?},?[,?],?(,?),?#,?.,?!,?|,?:,?+,?-,?>]
# Same as EscapedCharInText plus the two quote characters; used while
# reading the inside of a quoted value (see read_quoted).
EscapedCharInQuotes =
Set.new [?\\,?`,?*,?_,?{,?},?[,?],?(,?),?#,?.,?!,?|,?:,?+,?-,?>,?',?"]
# Backslash and backtick.
# NOTE(review): not referenced in the part of the file visible here.
EscapedCharInInlineCode = [?\\,?`]
# Joins +lines+ with newlines and parses the result as one span.
#
# lines::  array of strings
# parent:: forwarded unchanged to parse_span_better
#
# Returns whatever parse_span_better returns.
def parse_lines_as_span(lines, parent=nil)
  joined = lines.join("\n")
  parse_span_better(joined, parent)
end
# Parses +string+ as span-level Markdown.
#
# The text is copied and the copy frozen before it is wrapped in a
# CharSource, so the caller's string is left untouched. Reading stops at
# end of input (exit char +nil+), with EscapedCharInText as the set of
# escapable characters.
#
# string:: the text to parse; anything but a String is reported via +error+
# parent:: handed to CharSource.new
#
# Returns the result of read_span.
def parse_span_better(string, parent=nil)
  error "Passed #{string.class}." unless string.kind_of?(String)
  frozen_copy = (string + "").freeze
  read_span(CharSource.new(frozen_copy, parent), EscapedCharInText, [nil])
end
# This is the main loop for reading span elements.
#
# It's long, but not *complex* or difficult to understand: it looks at the
# current character of +src+ and dispatches on it, accumulating the result
# in a fresh SpanContext.
#
# src::             character source being consumed
# escaped::         collection of characters that a backslash may escape
# exit_on_chars::   array of characters that end the span (+nil+ stands for
#                   end of input); may itself be nil. The exit character is
#                   NOT consumed. Letters a-z/A-Z cannot be used here
#                   because of the fast path below.
# exit_on_strings:: optional array of strings that end the span; not
#                   consumed either.
#
# Returns the result of +educate+ applied to the elements that were read,
# after one leading and one trailing space have been trimmed.
def read_span(src, escaped, exit_on_chars, exit_on_strings=nil)
  con = SpanContext.new
  c = d = nil
  while true
    c = src.cur_char

    # This is only an optimization which cuts 50% of the time used.
    # (but you can't use a-zA-Z in exit_on_chars)
    if c && ((c>=?a && c<=?z) || ((c>=?A && c<=?Z)))
      con.cur_string << src.shift_char
      next
    end

    break if exit_on_chars && exit_on_chars.include?(c)
    break if exit_on_strings && exit_on_strings.any? {|x| src.cur_chars_are x}

    # check if there are extensions
    if check_span_extensions(src, con)
      next
    end

    case c = src.cur_char
    when ?\s
      # A hard line break is two spaces followed by a newline: exactly the
      # three characters skipped below. ("\s" is a space; spelled this way
      # so the literal survives whitespace normalisation.)
      if src.cur_chars_are "\s\s\n"
        src.ignore_chars(3)
        con.push_element md_br()
        next
      else
        src.ignore_char
        con.push_space
      end
    when ?\n, ?\t
      src.ignore_char
      con.push_space
    when ?`
      read_inline_code(src,con)
    when ?<
      # It could be:
      # 1) HTML "<div ..."
      # 2) HTML "<!-- ..."
      # 3) url "<http:// ", "<ftp:// ..."
      # 4) email "<andrea@... ", "<mailto:andrea@..."
      # 5) on itself! "a < b "
      # 6) Start of <<guillemettes>>
      case d = src.next_char
      when ?< # guillemettes
        src.ignore_chars(2)
        con.push_char ?<
        con.push_char ?<
      when ?!
        if src.cur_chars_are '<!--'
          read_inline_html(src, con)
        else
          con.push_char src.shift_char
        end
      when ??
        read_xml_instr_span(src, con)
      when ?\s, ?\t
        con.push_char src.shift_char
      else
        if src.next_matches(/<mailto:/) ||
           src.next_matches(/<[\w\.]+\@/)
          read_email_el(src, con)
        elsif src.next_matches(/<\w+:/)
          read_url_el(src, con)
        elsif src.next_matches(/<\w/)
          read_inline_html(src, con)
        else
          con.push_char src.shift_char
        end
      end
    when ?\\
      d = src.next_char
      if d == ?'
        src.ignore_chars(2)
        con.push_element md_entity('apos')
      elsif d == ?"
        src.ignore_chars(2)
        con.push_element md_entity('quot')
      elsif escaped.include? d
        src.ignore_chars(2)
        con.push_char d
      else
        # not an escape sequence: keep the backslash itself
        con.push_char src.shift_char
      end
    when ?[
      if markdown_extra? && src.next_char == ?^
        read_footnote_ref(src,con)
      else
        read_link(src, con)
      end
    when ?!
      if src.next_char == ?[
        read_image(src, con)
      else
        con.push_char src.shift_char
      end
    when ?&
      if m = src.read_regexp(/\&([\w\d]+);/)
        con.push_element md_entity(m[1])
      else
        con.push_char src.shift_char
      end
    when ?*
      if not src.next_char
        maruku_error "Opening * as last char.", src, con
        maruku_recover "Threating as literal", src, con
        con.push_char src.shift_char
      else
        follows = src.cur_chars(4)
        if follows =~ /^\*\*\*[^\s\*]/
          con.push_element read_emstrong(src,'***')
        elsif follows =~ /^\*\*[^\s\*]/
          con.push_element read_strong(src,'**')
        elsif follows =~ /^\*[^\s\*]/
          con.push_element read_em(src,'*')
        else # * is just a normal char
          con.push_char src.shift_char
        end
      end
    when ?_
      if not src.next_char
        maruku_error "Opening _ as last char", src, con
        maruku_recover "Threating as literal", src, con
        con.push_char src.shift_char
      else
        follows = src.cur_chars(4)
        if follows =~ /^\_\_\_[^\s\_]/
          con.push_element read_emstrong(src,'___')
        elsif follows =~ /^\_\_[^\s\_]/
          con.push_element read_strong(src,'__')
        elsif follows =~ /^\_[^\s\_]/
          con.push_element read_em(src,'_')
        else # _ is just a normal char
          con.push_char src.shift_char
        end
      end
    when ?{ # extension
      src.ignore_char # {
      interpret_extension(src, con, [?}])
      src.ignore_char # }
    when nil
      # End of input reached before any of the requested terminators.
      waiting_for = exit_on_chars ? "#{exit_on_chars.inspect} or " : ""
      maruku_error "Unclosed span (waiting for #{waiting_for}" +
        "#{exit_on_strings.inspect})", src, con
      break
    else # normal text
      con.push_char src.shift_char
    end # end case
  end # end while true

  con.push_string_if_present

  # Assign IAL to elements
  merge_ial(con.elements, src, con)

  # Remove leading space; drop the first element if that empties it.
  first = con.elements.first
  if first.kind_of? String
    first = first[1, first.size-1] if first[0] == ?\s
    if first.size == 0
      con.elements.shift
    else
      con.elements[0] = first
    end
  end

  # Remove final space; drop the last element if that empties it.
  last = con.elements.last
  if last.kind_of? String
    last.chop! if last[-1] == ?\s
    con.elements.pop if last.size == 0
  end

  educate(con.elements)
end
# Reads an XML processing instruction of the form <?target code ?>.
#
# Start: cursor on the opening "<".
# The two opening characters are skipped, an optional word is read as the
# target (empty string when absent), everything up to "?>" is read as the
# code, and the closing "?>" is skipped. The stripped code and the target
# are pushed onto +con+ as an md_xml_instr element.
def read_xml_instr_span(src, con)
  closing = "?>"
  src.ignore_chars(2) # starting <?

  target_match = src.read_regexp(/(\w+)/)
  target = target_match ? target_match[1] : ''

  body = read_simple(src, [], [], [closing])
  src.ignore_chars closing.size

  con.push_element md_xml_instr(target, (body || "").strip)
end
# Interprets the content of a "{...}" extension.
#
# Start: cursor on character **after** '{'
# End: cursor on '}' or EOF
#
# A leading ":" is skipped; a leading ":", "#" or "." goes straight to
# extension_meta. Anything else is read up to +break_on_chars+, reported
# through maruku_recover, and then also handed to extension_meta.
def interpret_extension(src, con, break_on_chars)
  lead = src.cur_char

  if lead == ?:
    src.ignore_char # :
    return extension_meta(src, con, break_on_chars)
  end

  if [?#, ?.].include?(lead)
    return extension_meta(src, con, break_on_chars)
  end

  stuff = read_simple(src, [?}], break_on_chars, [])
  if stuff =~ /^(\w+\s|[^\w])/
    # No extension ids are known here, so every id falls back to meta-data.
    extension_id = $1.strip
    maruku_recover "I don't know what to do with extension '#{extension_id}'\n"+
      "I will threat this:\n\t{#{stuff}} \n as meta-data.\n", src, con
  else
    maruku_recover "I will threat this:\n\t{#{stuff}} \n as meta-data.\n", src, con
  end
  extension_meta(src, con, break_on_chars)
end
# Reads the meta-data found inside a "{...}" extension.
#
# If the input starts with "name:", the attribute list that follows is an
# attribute list definition (ALD): it is stored in the document under
# +name+ and pushed onto +con+ as an md_ald element.
# Otherwise the attribute list is an inline attribute list (IAL): it is
# only pushed onto +con+ as an md_ial element. An IAL has no name, so
# nothing is stored in the document's ALD table.
#
# src::            character source, positioned at the start of the meta-data
# con::            span context receiving the element
# break_on_chars:: forwarded to read_attribute_list
def extension_meta(src, con, break_on_chars)
  if m = src.read_regexp(/([^:]+):/)
    name = m[1]
    al = read_attribute_list(src, con, break_on_chars)
    self.doc.ald[name] = al
    con.push md_ald(name, al)
  else
    al = read_attribute_list(src, con, break_on_chars)
    con.push md_ial(al)
  end
end
# Reads an autolink of the form "<url>" and pushes it onto +con+ as an
# md_url element.
#
# Start: cursor on the leading "<". The closing ">" is skipped as well.
def read_url_el(src,con)
  src.ignore_char # leading <
  target = read_simple(src, [], [?>])
  src.ignore_char # closing >
  con.push_element(md_url(target))
end
# Reads an e-mail autolink of the form "<address>" or "<mailto:address>"
# and pushes it onto +con+ as an md_email element, without the "mailto:"
# prefix.
#
# Start: cursor on the leading "<". The closing ">" is skipped as well.
def read_email_el(src,con)
  src.ignore_char # leading <
  raw = read_simple(src, [], [?>])
  src.ignore_char # closing >
  con.push_element(md_email(raw.gsub(/^mailto:/,'')))
end
# Reads a URL up to (not including) one of the +break_on+ characters.
#
# A URL starting with a quote character is reported through +error+.
# A surrounding "<...>" pair is removed.
#
# Returns the URL, or nil when nothing (or only "<>") was read.
def read_url(src, break_on)
  error 'Invalid char for url', src if [?',?"].include?(src.cur_char)

  raw = read_simple(src, [], break_on) || ""
  raw = raw[1, raw.size-2] if raw[0] == ?< && raw[-1] == ?>

  raw.size == 0 ? nil : raw
end
# Reads a value that may or may not be quoted.
#
# When the cursor is on ' or " the value is read with read_quoted;
# otherwise it is read with read_simple using +escaped+ and
# +exit_on_chars+.
def read_quoted_or_unquoted(src, con, escaped, exit_on_chars)
  starts_with_quote = [?', ?"].include?(src.cur_char)
  return read_quoted(src, con) if starts_with_quote

  read_simple(src, escaped, exit_on_chars)
end
# Tries to read a quoted value.
#
# If the stream does not start with ' or ", nothing is consumed and nil is
# returned. Otherwise the opening quote is consumed, the text up to the
# same quote character is read (honouring EscapedCharInQuotes), and the
# closing quote is skipped.
#
# Returns the text between the quotes, as returned by read_simple.
def read_quoted(src, con)
  return nil unless [?', ?"].include?(src.cur_char)

  opening = src.shift_char # opening quote
  quoted = read_simple(src, EscapedCharInQuotes, [opening])
  src.ignore_char # closing quote
  quoted
end
# Reads a simple string (no formatting) until one of +exit_on_chars+ or one
# of +exit_on_strings+ is found, resolving backslash escapes on the way.
#
# src::             character source being consumed
# escaped::         collection of characters that a backslash may escape;
#                   any other backslash is kept literally
# exit_on_chars::   array of terminating characters, or nil
# exit_on_strings:: array of terminating strings, or nil
#
# The terminator itself is never consumed. If the input ends before a
# terminator is found, the problem is reported through maruku_error /
# maruku_recover and whatever was read so far is returned.
#
# Returns the string read, or nil if it is empty.
def read_simple(src, escaped, exit_on_chars, exit_on_strings=nil)
  text = ""
  while true
    c = src.cur_char

    break if exit_on_chars && exit_on_chars.include?(c)
    break if exit_on_strings &&
      exit_on_strings.any? {|x| src.cur_chars_are x}

    case c
    when nil
      # exit_on_chars may legitimately be nil (see the guard above), so do
      # not assume it is an array when building the message.
      delims = (exit_on_chars || []).map{|x|""<<x}
      s= "String finished while reading (break on "+
        "#{delims.inspect})"+
        " already read: #{text.inspect}"
      maruku_error s, src
      maruku_recover "I boldly continue", src
      break
    when ?\\
      d = src.next_char
      if escaped.include? d
        src.ignore_chars(2)
        text << d
      else
        text << src.shift_char
      end
    else
      text << src.shift_char
    end
  end
  text.empty? ? nil : text
end
# Reads an emphasised span delimited by the one-character +delim+.
#
# Start: cursor on the opening delimiter.
# One character is skipped, the content is read up to +delim+, and one
# more character (the closing delimiter) is skipped.
#
# Returns the md_em element wrapping the content.
def read_em(src, delim)
  src.ignore_char # opening delimiter
  inner = read_span(src, EscapedCharInText, nil, [delim])
  src.ignore_char # closing delimiter
  md_em(inner)
end
# Reads a strong span delimited by the two-character +delim+.
#
# Start: cursor on the opening delimiter.
# Two characters are skipped, the content is read up to +delim+, and two
# more characters (the closing delimiter) are skipped.
#
# Returns the md_strong element wrapping the content.
def read_strong(src, delim)
  src.ignore_chars(2) # opening delimiter
  inner = read_span(src, EscapedCharInText, nil, [delim])
  src.ignore_chars(2) # closing delimiter
  md_strong(inner)
end
# Reads an emphasised-and-strong span delimited by the three-character
# +delim+.
#
# Start: cursor on the opening delimiter.
# Three characters are skipped, the content is read up to +delim+, and
# three more characters (the closing delimiter) are skipped.
#
# Returns the md_emstrong element wrapping the content.
def read_emstrong(src, delim)
  src.ignore_chars(3) # opening delimiter
  inner = read_span(src, EscapedCharInText, nil, [delim])
  src.ignore_chars(3) # closing delimiter
  md_emstrong(inner)
end
# The space character.
SPACE = ?\s
# Matches the inside of a bracketed reference id once the opening "[" has
# been consumed: group 1 is the id (no "]" or whitespace), group 2 is any
# whitespace before the closing "]" plus the "]" itself.
R_REF_ID = /([^\]\s]*)(\s*\])/
# Reads a bracketed id "[refid]". Consumes also both brackets.
#
# Start: cursor on the opening "[". Whitespace after the bracket is
# skipped before R_REF_ID is applied.
#
# Returns the id (possibly empty), or nil if R_REF_ID does not match.
def read_ref_id(src, con)
  src.ignore_char # [
  src.consume_whitespace
  found = src.read_regexp(R_REF_ID)
  found ? found[1] : nil
end
# Reads a bracketed footnote reference with read_ref_id and pushes it
# onto +con+ as an md_foot_ref element.
def read_footnote_ref(src,con)
  footnote_id = read_ref_id(src, con)
  con.push_element(md_foot_ref(footnote_id))
end
# Reads a piece of inline HTML starting at the cursor.
#
# The remaining buffer of +src+ is fed to an HTMLHelper. If the helper
# consumed something, what it read is pushed onto +con+ as an md_html
# element and that many characters are skipped in +src+; otherwise a
# single character is passed through as plain text.
#
# Any exception raised on the way is reported through maruku_error /
# maruku_recover, and a single character is passed through as plain text
# so that parsing can go on.
def read_inline_html(src, con)
  helper = HTMLHelper.new
  begin
    # This is our current buffer in the context
    buffer = src.current_remaining_buffer
    helper.eat_this buffer

    unless helper.is_finished?
      error "inline_html: Malformed:\n "+
        "#{buffer.inspect}\n #{helper.inspect}",src,con
    end

    eaten = buffer.size - helper.rest.size
    if eaten <= 0
      puts "HTML helper did not work on #{buffer.inspect}"
      con.push_char src.shift_char
    else
      con.push_element md_html(helper.stuff_you_read)
      src.ignore_chars(eaten)
    end
  rescue Exception => e
    maruku_error "Bad html: \n" +
      add_tabs(e.inspect+e.backtrace.join("\n"),1,'>'),
      src,con
    maruku_recover "I will try to continue after bad HTML.", src, con
    con.push_char src.shift_char
  end
end
# Reads an inline code span and pushes it onto +con+ as an md_code element.
#
# Start: cursor on the first opening backtick.
# The opening run of backticks is counted; the code extends up to the next
# run of the same length, which is skipped. When more than one backtick is
# used as delimiter, one leading and one trailing space are dropped from
# the code.
def read_inline_code(src, con)
  # Count the number of ticks
  num_ticks = 0
  while src.cur_char == ?`
    num_ticks += 1
    src.ignore_char
  end

  # We will read until this string
  end_string = "`"*num_ticks

  # read_simple returns nil when nothing could be read (for instance when
  # the input ends right after the opening ticks); treat that as empty
  # code instead of indexing into nil below.
  code = read_simple(src, [], [], [end_string]) || ""

  src.ignore_chars num_ticks

  if num_ticks > 1
    # Ignore at most one leading space
    code = code[1, code.size-1] if code[0] == SPACE
    # ... and at most one trailing space
    code = code[0, code.size-1] if code[-1] == SPACE
  end

  con.push_element md_code(code)
end
# Reads a link and pushes the resulting element onto +con+.
#
# Start: cursor on the opening "[".
#
# Three forms are handled after the bracketed link text:
# * "(url)" or "(url "title")" -> md_im_link(children, url, title)
# * "[ref id]"                 -> md_link(children, ref_id)
# * nothing                    -> md_link(children, id derived from the text)
#
# An empty or missing ref id is derived from the link text: lower-cased,
# with spaces replaced by underscores. Explicit ref ids are lower-cased.
#
# If the closing ")" is missing, or the ref id cannot be read, the problem
# is reported and the link text is pushed back as ordinary elements.
def read_link(src, con)
  # we read the string and see what happens
  src.ignore_char # opening bracket
  children = read_span(src, EscapedCharInText, [?]])
  src.ignore_char # closing bracket

  # ignore one space between the text and what follows
  if src.cur_char == SPACE && [?[, ?(].include?(src.next_char)
    src.shift_char
  end

  case src.cur_char
  when ?(
    src.ignore_char # opening (
    src.consume_whitespace
    url = read_url(src, [SPACE,?\t,?)])
    if not url
      url = '' # no url is ok
    end
    src.consume_whitespace
    title = nil
    if src.cur_char != ?) # we have a title
      quote_char = src.cur_char
      title = read_quoted(src,con)
      if not title
        maruku_error 'Must quote title',src,con
      else
        # Tries to read a title with quotes: ![a](url "ti"tle")
        # this is the most ugly thing in Markdown
        if not src.next_matches(/\s*\)/)
          # if there is not a closing par ), then read
          # the rest and guess it's title with quotes
          rest = read_simple(src, [], [?)], [])
          # rest is nil when the input ends here; nothing to append then
          if rest
            # chop the closing quote
            rest.chop!
            title << quote_char << rest
          end
        end
      end
    end
    src.consume_whitespace
    closing = src.shift_char # closing )
    if closing != ?)
      maruku_error 'Unclosed link',src,con
      maruku_recover "No closing ): I will not create"+
        " the link for #{children.inspect}", src, con
      con.push_elements children
      return
    end
    con.push_element md_im_link(children,url, title)
  when ?[ # link ref
    ref_id = read_ref_id(src,con)
    if ref_id
      if ref_id.size == 0
        # join (not to_s): Array#to_s returns the inspect form on Ruby >= 1.9
        ref_id = children.join.downcase.gsub(' ','_')
      else
        ref_id = ref_id.downcase
      end
      con.push_element md_link(children, ref_id)
    else
      maruku_error "Could not read ref_id", src, con
      maruku_recover "I will not create the link for "+
        "#{children.inspect}", src, con
      con.push_elements children
      return
    end
  else # empty [link]
    id = children.join.downcase.gsub(' ','_')
    con.push_element md_link(children, id)
  end
end # read link
# Reads an image and pushes the resulting element onto +con+.
#
# Start: cursor on the "!" of the opening "![".
#
# Three forms are handled after the bracketed alternate text:
# * "(url)" or "(url "title")" -> md_im_image(alt_text, url, title)
# * "[ref id]"                 -> md_image(alt_text, ref_id)
# * nothing                    -> md_image(alt_text, id derived from the text)
#
# An empty or missing ref id is derived from the alternate text:
# lower-cased, with spaces replaced by underscores. Explicit ref ids are
# lower-cased.
#
# A missing url or closing ")" is reported through +error+. If the ref id
# cannot be read, the problem is reported and the alternate text is pushed
# back as ordinary elements, as read_link does for link text.
def read_image(src, con)
  src.ignore_chars(2) # opening "!["
  alt_text = read_span(src, EscapedCharInText, [?]])
  src.ignore_char # closing bracket

  # ignore one space between the text and what follows
  if src.cur_char == SPACE && [?[, ?(].include?(src.next_char)
    src.ignore_char
  end

  case src.cur_char
  when ?(
    src.ignore_char # opening (
    src.consume_whitespace
    url = read_url(src, [SPACE,?\t,?)])
    if not url
      error "Could not read url from #{src.cur_chars(10).inspect}",
        src,con
    end
    src.consume_whitespace
    title = nil
    if src.cur_char != ?) # we have a title
      quote_char = src.cur_char
      title = read_quoted(src,con)
      if not title
        maruku_error 'Must quote title',src,con
      else
        # Tries to read a title with quotes: ![a](url "ti"tle")
        # this is the most ugly thing in Markdown
        if not src.next_matches(/\s*\)/)
          # if there is not a closing par ), then read
          # the rest and guess it's title with quotes
          rest = read_simple(src, [], [?)], [])
          # rest is nil when the input ends here; nothing to append then
          if rest
            # chop the closing quote
            rest.chop!
            title << quote_char << rest
          end
        end
      end
    end
    src.consume_whitespace
    closing = src.shift_char # closing )
    if closing != ?)
      # closing is nil at end of input; show it as an empty string
      shown = closing ? ("" << closing) : ""
      error "Unclosed link: '#{shown}'"+
        " Read url=#{url.inspect} title=#{title.inspect}",src,con
    end
    con.push_element md_im_image(alt_text, url, title)
  when ?[ # link ref
    ref_id = read_ref_id(src,con)
    if ref_id
      if ref_id.size == 0
        # join (not to_s): Array#to_s returns the inspect form on Ruby >= 1.9
        ref_id = alt_text.join.downcase.gsub(' ','_')
      else
        ref_id = ref_id.downcase
      end
      con.push_element md_image(alt_text, ref_id)
    else
      maruku_error "Could not read ref_id", src, con
      maruku_recover "I will not create the image for "+
        "#{alt_text.inspect}", src, con
      con.push_elements alt_text
      return
    end
  else # no stuff
    ref_id = alt_text.join.downcase.gsub(' ','_')
    con.push_element md_image(alt_text, ref_id)
  end
end # read image
# Accumulates what has been read while parsing one span: a list of
# finished elements plus the plain text currently being collected.
class SpanContext
  include MaRuKu::Strings

  # Read elements
  attr_accessor :elements
  # Plain text collected so far and not yet moved into +elements+
  attr_accessor :cur_string

  def initialize
    @elements = []
    @cur_string = ""
  end

  # Appends the element +e+ (a String or an MDElement), after flushing any
  # pending text so that order is preserved. Raises on any other class.
  def push_element(e)
    raise "Only MDElement and String, please. You pushed #{e.class}: #{e.inspect} " if
      not (e.kind_of?(String) or e.kind_of?(MDElement))

    push_string_if_present

    @elements << e
    nil
  end
  alias push push_element

  # Appends every member of +a+: strings are added to the pending text,
  # anything else goes through push_element.
  def push_elements(a)
    a.each do |e|
      if e.kind_of? String
        # Append the string as a whole: going through each_byte would
        # re-encode every byte of a multi-byte character separately.
        push_char e
      else
        push_element e
      end
    end
  end

  # Moves the pending text, if any, to the end of +elements+.
  def push_string_if_present
    if @cur_string.size > 0
      @elements << @cur_string
      @cur_string = ""
    end
    nil
  end

  # Appends +c+ to the pending text.
  def push_char(c)
    @cur_string << c
    nil
  end

  # push space into current string if
  # there isn't one
  def push_space
    last = @cur_string[@cur_string.size-1]
    # ?\s is the space character
    @cur_string << ?\s if last != ?\s
  end

  # Returns a human-readable dump of the elements read so far and of the
  # pending text.
  def describe
    lines = @elements.map{|x| x.inspect}.join("\n")
    s = "Elements read in span: \n" +
      add_tabs(lines,1, ' -')+"\n"

    if @cur_string.size > 0
      s += "Current string: \n #{@cur_string.inspect}\n"
    end
    s
  end
end # SpanContext
end end end end # module MaRuKu; module In; module Markdown; module SpanLevelParser