s around - # "paragraphs" that are wrapped in non-block-level tags, such as anchors, - # phrase emphasis, and spans. The list of tags we're looking for is - # hard-coded: - my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/; - my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/; - - # First, look for nested blocks, e.g.: - #
tags.
-# my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
-
- foreach my $cur_token (@$tokens) {
- if ($cur_token->[0] eq "tag") {
- # Within tags, encode * and _ so they don't conflict
- # with their use in Markdown for italics and strong.
- # We're replacing each such character with its
- # corresponding MD5 checksum value; this is likely
- # overkill, but it should prevent us from colliding
- # with the escape values by accident.
- $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!gx;
- $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!gx;
- $text .= $cur_token->[1];
- } else {
- my $t = $cur_token->[1];
- $t = _EncodeBackslashEscapes($t);
- $text .= $t;
- }
- }
- return $text;
-}
-
-
-sub _DoAnchors {
-#
-# Turn Markdown link shortcuts into XHTML tags.
-#
- my $text = shift;
-
- #
- # First, handle reference-style links: [link text] [id]
- #
- $text =~ s{
- ( # wrap whole match in $1
- \[
- ($g_nested_brackets) # link text = $2
- \]
-
- [ ]? # one optional space
- (?:\n[ ]*)? # one optional newline followed by spaces
-
- \[
- (.*?) # id = $3
- \]
- )
- }{
- my $result;
- my $whole_match = $1;
- my $link_text = $2;
- my $link_id = lc $3;
-
- if ($link_id eq "") {
- $link_id = lc $link_text; # for shortcut links like [this][].
- }
-
- if (defined $g_urls{$link_id}) {
- my $url = $g_urls{$link_id};
- $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
- $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
- $result = "? # href = $3
- [ \t]*
- ( # $4
- (['"]) # quote char = $5
- (.*?) # Title = $6
- \5 # matching quote
- )? # title is optional
- \)
- )
- }{
- my $result;
- my $whole_match = $1;
- my $link_text = $2;
- my $url = $3;
- my $title = $6;
-
- $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
- $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
- $result = " tags.
-#
- my $text = shift;
-
- #
- # First, handle reference-style labeled images: ![alt text][id]
- #
- $text =~ s{
- ( # wrap whole match in $1
- !\[
- (.*?) # alt text = $2
- \]
-
- [ ]? # one optional space
- (?:\n[ ]*)? # one optional newline followed by spaces
-
- \[
- (.*?) # id = $3
- \]
-
- )
- }{
- my $result;
- my $whole_match = $1;
- my $alt_text = $2;
- my $link_id = lc $3;
-
- if ($link_id eq "") {
- $link_id = lc $alt_text; # for shortcut links like ![this][].
- }
-
- $alt_text =~ s/"/"/g;
- if (defined $g_urls{$link_id}) {
- my $url = $g_urls{$link_id};
- $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
- $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
- $result = "
? # src url = $3
- [ \t]*
- ( # $4
- (['"]) # quote char = $5
- (.*?) # title = $6
- \5 # matching quote
- [ \t]*
- )? # title is optional
- \)
- )
- }{
- my $result;
- my $whole_match = $1;
- my $alt_text = $2;
- my $url = $3;
- my $title = '';
- if (defined($6)) {
- $title = $6;
- }
-
- $alt_text =~ s/"/"/g;
- $title =~ s/"/"/g;
- $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
- $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
- $result = "
" . _RunSpanGamut($1) . "\n\n";
- }egmx;
-
- $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
- "" . _RunSpanGamut($1) . "
\n\n";
- }egmx;
-
-
- # atx-style headers:
- # # Header 1
- # ## Header 2
- # ## Header 2 with closing hashes ##
- # ...
- # ###### Header 6
- #
- $text =~ s{
- ^(\#{1,6}) # $1 = string of #'s
- [ \t]*
- (.+?) # $2 = Header text
- [ \t]*
- \#* # optional closing #'s (not counted)
- \n+
- }{
- my $h_level = length($1);
- "" . _RunSpanGamut($2) . " \n\n";
- }egmx;
-
- return $text;
-}
-
-
-sub _DoLists {
-#
-# Form HTML ordered (numbered) and unordered (bulleted) lists.
-#
- my $text = shift;
- my $less_than_tab = $g_tab_width - 1;
-
- # Re-usable patterns to match list item bullets and number markers:
- my $marker_ul = qr/[*+-]/;
- my $marker_ol = qr/\d+[.]/;
- my $marker_any = qr/(?:$marker_ul|$marker_ol)/;
-
- # Re-usable pattern to match any entirel ul or ol list:
- my $whole_list = qr{
- ( # $1 = whole list
- ( # $2
- [ ]{0,$less_than_tab}
- (${marker_any}) # $3 = first list item marker
- [ \t]+
- )
- (?s:.+?)
- ( # $4
- \z
- |
- \n{2,}
- (?=\S)
- (?! # Negative lookahead for another list item marker
- [ \t]*
- ${marker_any}[ \t]+
- )
- )
- )
- }mx;
-
- # We use a different prefix before nested lists than top-level lists.
- # See extended comment in _ProcessListItems().
- #
- # Note: There's a bit of duplication here. My original implementation
- # created a scalar regex pattern as the conditional result of the test on
- # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
- # substitution once, using the scalar as the pattern. This worked,
- # everywhere except when running under MT on my hosting account at Pair
- # Networks. There, this caused all rebuilds to be killed by the reaper (or
- # perhaps they crashed, but that seems incredibly unlikely given that the
- # same script on the same server ran fine *except* under MT. I've spent
- # more time trying to figure out why this is happening than I'd like to
- # admit. My only guess, backed up by the fact that this workaround works,
- # is that Perl optimizes the substition when it can figure out that the
- # pattern will never change, and when this optimization isn't on, we run
- # afoul of the reaper. Thus, the slightly redundant code to that uses two
- # static s/// patterns rather than one conditional pattern.
-
- if ($g_list_level) {
- $text =~ s{
- ^
- $whole_list
- }{
- my $list = $1;
- my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
- # Turn double returns into triple returns, so that we can make a
- # paragraph for the last item in a list, if necessary:
- $list =~ s/\n{2,}(?! {8,})/\n\n\n/g;
- my $result = _ProcessListItems($list, $marker_any);
- $result = "<$list_type>\n" . $result . "$list_type>\n";
- $result;
- }egmx;
- }
- else {
- $text =~ s{
- (?:(?<=\n\n)|\A\n?)
- $whole_list
- }{
- my $list = $1;
- my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
- # Turn double returns into triple returns, so that we can make a
- # paragraph for the last item in a list, if necessary:
- $list =~ s/\n{2,}(?! {8,})/\n\n\n/g;
- my $result = _ProcessListItems($list, $marker_any);
- $result = "<$list_type>\n" . $result . "$list_type>\n";
- $result;
- }egmx;
- }
-
-
- return $text;
-}
-
-
-sub _ProcessListItems {
-#
-# Process the contents of a single ordered or unordered list, splitting it
-# into individual list items.
-#
-
- my $list_str = shift;
- my $marker_any = shift;
-
-
- # The $g_list_level global keeps track of when we're inside a list.
- # Each time we enter a list, we increment it; when we leave a list,
- # we decrement. If it's zero, we're not in a list anymore.
- #
- # We do this because when we're not inside a list, we want to treat
- # something like this:
- #
- # I recommend upgrading to version
- # 8. Oops, now this line is treated
- # as a sub-list.
- #
- # As a single paragraph, despite the fact that the second line starts
- # with a digit-period-space sequence.
- #
- # Whereas when we're inside a list (or sub-list), that line will be
- # treated as the start of a sub-list. What a kludge, huh? This is
- # an aspect of Markdown's syntax that's hard to parse perfectly
- # without resorting to mind-reading. Perhaps the solution is to
- # change the syntax rules such that sub-lists must start with a
- # starting cardinal number; e.g. "1." or "a.".
-
- $g_list_level++;
-
- # trim trailing blank lines:
- $list_str =~ s/\n{2,}\z/\n/;
-
-
- $list_str =~ s{
- (\n)? # leading line = $1
- (^[ \t]*) # leading whitespace = $2
- ($marker_any) [ \t]+ # list marker = $3
- ((?s:.+?) # list item text = $4
- (\n{1,2}))
- (?= \n* (\z | \2 ($marker_any) [ \t]+))
- }{
- my $item = $4;
- my $leading_line = $1;
- my $leading_space = $2;
-
- if ($leading_line or ($item =~ m/\n{2,}/)) {
- $item = _RunBlockGamut(_Outdent($item));
- }
- else {
- # Recursion for sub-lists:
- $item = _DoLists(_Outdent($item));
- chomp $item;
- $item = _RunSpanGamut($item);
- }
-
- "" . $item . " \n";
- }egmx;
-
- $g_list_level--;
- return $list_str;
-}
-
-
-
-sub _DoCodeBlocks {
-#
-# Process Markdown `` blocks.
-#
-
- my $text = shift;
-
- $text =~ s{
- (?:\n\n|\A)
- ( # $1 = the code block -- one or more lines, starting with a space/tab
- (?:
- (?:[ ]{$g_tab_width} | \t) # Lines must start with a tab or a tab-width of spaces
- .*\n+
- )+
- )
- ((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
- }{
- my $codeblock = $1;
- my $result; # return value
-
- $codeblock = _EncodeCode(_Outdent($codeblock));
- $codeblock = _Detab($codeblock);
- $codeblock =~ s/\A\n+//; # trim leading newlines
- $codeblock =~ s/\s+\z//; # trim trailing whitespace
-
- $result = "\n\n" . $codeblock . "\n
\n\n";
-
- $result;
- }egmx;
-
- return $text;
-}
-
-
-sub _DoCodeSpans {
-#
-# * Backtick quotes are used for
spans.
-#
-# * You can use multiple backticks as the delimiters if you want to
-# include literal backticks in the code span. So, this input:
-#
-# Just type ``foo `bar` baz`` at the prompt.
-#
-# Will translate to:
-#
-# Just type foo `bar` baz
at the prompt.
-#
-# There's no arbitrary limit to the number of backticks you
-# can use as delimters. If you need three consecutive backticks
-# in your code, use four for delimiters, etc.
-#
-# * You can use spaces to get literal backticks at the edges:
-#
-# ... type `` `bar` `` ...
-#
-# Turns to:
-#
-# ... type `bar`
...
-#
-
- my $text = shift;
-
- $text =~ s@
- (`+) # $1 = Opening run of `
- (.+?) # $2 = The code block
- (?$c
";
- @egsx;
-
- return $text;
-}
-
-
-sub _EncodeCode {
-#
-# Encode/escape certain characters inside Markdown code runs.
-# The point is that in code, these characters are literals,
-# and lose their special Markdown meanings.
-#
- local $_ = shift;
-
- # Encode all ampersands; HTML entities are not
- # entities within a Markdown code span.
- s/&/&/g;
-
- # Encode $'s, but only if we're running under Blosxom.
- # (Blosxom interpolates Perl variables in article bodies.)
- {
- no warnings 'once';
- if (defined($blosxom::version)) {
- s/\$/$/g;
- }
- }
-
-
- # Do the angle bracket song and dance:
- s! < !<!gx;
- s! > !>!gx;
-
- # Now, escape characters that are magic in Markdown:
- s! \* !$g_escape_table{'*'}!gx;
- s! _ !$g_escape_table{'_'}!gx;
- s! { !$g_escape_table{'{'}!gx;
- s! } !$g_escape_table{'}'}!gx;
- s! \[ !$g_escape_table{'['}!gx;
- s! \] !$g_escape_table{']'}!gx;
- s! \\ !$g_escape_table{'\\'}!gx;
-
- return $_;
-}
-
-
-sub _DoItalicsAndBold {
- my $text = shift;
-
- # must go first:
- $text =~ s{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
- {$2}gsx;
-
- $text =~ s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }
- {$2}gsx;
-
- return $text;
-}
-
-
-sub _DoBlockQuotes {
- my $text = shift;
-
- $text =~ s{
- ( # Wrap whole match in $1
- (
- ^[ \t]*>[ \t]? # '>' at the start of a line
- .+\n # rest of the first line
- (.+\n)* # subsequent consecutive lines
- \n* # blanks
- )+
- )
- }{
- my $bq = $1;
- $bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting
- $bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines
- $bq = _RunBlockGamut($bq); # recurse
-
- $bq =~ s/^/ /g;
- # These leading spaces screw with content, so we need to fix that:
- $bq =~ s{
- (\s*.+?
)
- }{
- my $pre = $1;
- $pre =~ s/^ //mg;
- $pre;
- }egsx;
-
- "\n$bq\n
\n\n";
- }egmx;
-
-
- return $text;
-}
-
-
-sub _FormParagraphs {
-#
-# Params:
-# $text - string to process with html tags
-#
- my $text = shift;
-
- # Strip leading and trailing lines:
- $text =~ s/\A\n+//;
- $text =~ s/\n+\z//;
-
- my @grafs = split(/\n{2,}/, $text);
-
- #
- # Wrap
tags.
- #
- foreach (@grafs) {
- unless (defined( $g_html_blocks{$_} )) {
- $_ = _RunSpanGamut($_);
- s/^([ \t]*)/
/;
- $_ .= "
";
- }
- }
-
- #
- # Unhashify HTML blocks
- #
- foreach (@grafs) {
- if (defined( $g_html_blocks{$_} )) {
- $_ = $g_html_blocks{$_};
- }
- }
-
- return join "\n\n", @grafs;
-}
-
-
-sub _EncodeAmpsAndAngles {
-# Smart processing for ampersands and angle brackets that need to be encoded.
-
- my $text = shift;
-
- # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
- # http://bumppo.net/projects/amputator/
- $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&/g;
-
- # Encode naked <'s
- $text =~ s{<(?![a-z/?\$!])}{<}gi;
-
- return $text;
-}
-
-
-sub _EncodeBackslashEscapes {
-#
-# Parameter: String.
-# Returns: The string, with after processing the following backslash
-# escape sequences.
-#
- local $_ = shift;
-
- s! \\\\ !$g_escape_table{'\\'}!gx; # Must process escaped backslashes first.
- s! \\` !$g_escape_table{'`'}!gx;
- s! \\\* !$g_escape_table{'*'}!gx;
- s! \\_ !$g_escape_table{'_'}!gx;
- s! \\\{ !$g_escape_table{'{'}!gx;
- s! \\\} !$g_escape_table{'}'}!gx;
- s! \\\[ !$g_escape_table{'['}!gx;
- s! \\\] !$g_escape_table{']'}!gx;
- s! \\\( !$g_escape_table{'('}!gx;
- s! \\\) !$g_escape_table{')'}!gx;
- s! \\> !$g_escape_table{'>'}!gx;
- s! \\\# !$g_escape_table{'#'}!gx;
- s! \\\+ !$g_escape_table{'+'}!gx;
- s! \\\- !$g_escape_table{'-'}!gx;
- s! \\\. !$g_escape_table{'.'}!gx;
- s{ \\! }{$g_escape_table{'!'}}gx;
-
- return $_;
-}
-
-
-sub _DoAutoLinks {
- my $text = shift;
-
- $text =~ s{<((https?|ftp):[^'">\s]+)>}{$1}gi;
-
- # Email addresses:
- $text =~ s{
- <
- (?:mailto:)?
- (
- [-.\w]+
- \@
- [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
- )
- >
- }{
- _EncodeEmailAddress( _UnescapeSpecialChars($1) );
- }egix;
-
- return $text;
-}
-
-
-sub _EncodeEmailAddress {
-#
-# Input: an email address, e.g. "foo@example.com"
-#
-# Output: the email address as a mailto link, with each character
-# of the address encoded as either a decimal or hex entity, in
-# the hopes of foiling most address harvesting spam bots. E.g.:
-#
-# foo
-# @example.com
-#
-# Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
-# mailing list:
-#
-
- my $addr = shift;
-
- srand;
- my @encode = (
- sub { '' . ord(shift) . ';' },
- sub { '' . sprintf( "%X", ord(shift) ) . ';' },
- sub { shift },
- );
-
- $addr = "mailto:" . $addr;
-
- $addr =~ s{(.)}{
- my $char = $1;
- if ( $char eq '@' ) {
- # this *must* be encoded. I insist.
- $char = $encode[int rand 1]->($char);
- } elsif ( $char ne ':' ) {
- # leave ':' alone (to spot mailto: later)
- my $r = rand;
- # roughly 10% raw, 45% hex, 45% dec
- $char = (
- $r > .9 ? $encode[2]->($char) :
- $r < .45 ? $encode[1]->($char) :
- $encode[0]->($char)
- );
- }
- $char;
- }gex;
-
- $addr = qq{$addr};
- $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
-
- return $addr;
-}
-
-
-sub _UnescapeSpecialChars {
-#
-# Swap back in all the special characters we've hidden.
-#
- my $text = shift;
-
- while( my($char, $hash) = each(%g_escape_table) ) {
- $text =~ s/$hash/$char/g;
- }
- return $text;
-}
-
-
-sub _TokenizeHTML {
-#
-# Parameter: String containing HTML markup.
-# Returns: Reference to an array of the tokens comprising the input
-# string. Each token is either a tag (possibly with nested,
-# tags contained therein, such as , or a
-# run of text between tags. Each element of the array is a
-# two-element array; the first is either 'tag' or 'text';
-# the second is the actual value.
-#
-#
-# Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
-#
-#
-
- my $str = shift;
- my $pos = 0;
- my $len = length $str;
- my @tokens;
-
- my $depth = 6;
- my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
- my $match = qr/(?s: ) | # comment
- (?s: <\? .*? \?> ) | # processing instruction
- $nested_tags/ix; # nested tags
-
- while ($str =~ m/($match)/g) {
- my $whole_tag = $1;
- my $sec_start = pos $str;
- my $tag_start = $sec_start - length $whole_tag;
- if ($pos < $tag_start) {
- push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
- }
- push @tokens, ['tag', $whole_tag];
- $pos = pos $str;
- }
- push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
- \@tokens;
-}
-
-
-sub _Outdent {
-#
-# Remove one level of line-leading tabs or spaces
-#
- my $text = shift;
-
- $text =~ s/^(\t|[ ]{1,$g_tab_width})//gm;
- return $text;
-}
-
-
-sub _Detab {
-#
-# Cribbed from a post by Bart Lateur:
-#
-#
- my $text = shift;
-
- $text =~ s{(.*?)\t}{$1.(' ' x ($g_tab_width - length($1) % $g_tab_width))}ge;
- return $text;
-}
-
-
-1;
-
-__END__
-
-
-=pod
-
-=head1 NAME
-
-B
-
-
-=head1 SYNOPSIS
-
-B [ B<--html4tags> ] [ B<--version> ] [ B<-shortversion> ]
- [ I ... ]
-
-
-=head1 DESCRIPTION
-
-Markdown is a text-to-HTML filter; it translates an easy-to-read /
-easy-to-write structured text format into HTML. Markdown's text format
-is most similar to that of plain text email, and supports features such
-as headers, *emphasis*, code blocks, blockquotes, and links.
-
-Markdown's syntax is designed not as a generic markup language, but
-specifically to serve as a front-end to (X)HTML. You can use span-level
-HTML tags anywhere in a Markdown document, and you can use block level
-HTML tags (like and