# setext -> HTML converter

#!/usr/bin/perl -w
# $Id: setext2html.txt,v 1.9 2007/09/08 $
# setext -> HTML converter
#
# (C) 2002 Erik Oliver
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
# USA.

use strict;
use Carp;

# Global variables
# File-scoped state shared by the subs and the conversion passes below.
my $numargs = @ARGV; # number of command-line arguments (array in scalar context)
my $infile; # path of the setext input file, set during argument parsing
my $outfile; # path of the HTML output file, set during argument parsing
my %href; # hot-tt key => entity-encoded URL, filled from ".. _key URL" lines
my @toc; # table-of-contents entries, each stored as "level:headline"
my $tocindex = -1; # index of the most recent entry in @toc; -1 while @toc is empty

# style_encode(STRING)
#
# Escape "<", ">" and "&" in STRING and then translate the inline setext
# typotags into HTML:
#
#   **text**     -> <strong>text</strong>            (bold-tt)
#   ~some~text~  -> <em>some text</em>               (italic-tt)
#   _some_text_  -> <em>some text</em>               (underline-tt)
#   some_word_   -> <a href="URL">some word</a>      (hot-tt)
#
# For hot-tt the URL is looked up in the file-level %href hash under the
# key as written (underscores included); when no URL is registered the
# word is emitted as plain text with its underscores turned into spaces.
#
# Returns the converted string.  The argument itself is not modified.
sub style_encode($) {
    use HTML::Entities;
    my $string = shift;

    # Escape first so that the tags inserted below are not escaped again.
    $string = encode_entities($string,"<>&");

    # bold-tt
    $string =~ s#\*\*([^\*]*)\*\*#<strong>$1</strong>#g;

    # italic-tt: inner "~" separate the words of a multi-word phrase.
    # Lexicals are used inside the replacement code; the package global $a
    # belongs to sort() and must not be used as scratch space.
    $string =~ s{~(\S*)~}{
        (my $text = $1) =~ tr/~/ /;
        "<em>$text</em>";
    }eg;

    # underline-tt: inner "_" separate the words of a multi-word phrase.
    $string =~ s{\b_(\S*)_\b}{
        (my $text = $1) =~ tr/_/ /;
        "<em>$text</em>";
    }eg;

    # hot-tt: copy the capture before doing anything else, because any
    # further pattern match would overwrite $1.
    $string =~ s{\b(\S+)_\b}{
        my $key = $1;
        my $url = $href{$key};
        (my $text = $key) =~ tr/_/ /;
        $url ? qq{<a href="$url">$text</a>} : $text;
    }eg;

    return($string);
}

# encode_only(STRING)
#
# Return a copy of STRING with "<", ">" and "&" replaced by HTML entities.
# No typotag processing is done; this is used for verbatim text and for
# header fields that must appear literally.
sub encode_only($) {
    use HTML::Entities;
    my ($text) = @_;

    my $escaped = encode_entities($text, "<>&");

    return $escaped;
}

# Work out the input and output paths from the command line.
#
#   two arguments: explicit input file and output file
#   one argument : a base name; ".etx" and ".html" are appended to it
#                  (a trailing ".etx" on the base name is removed first,
#                  with a warning)
#   anything else: print usage and exit
if ($numargs == 2) {
    ($infile, $outfile) = @ARGV[0, 1];
}
elsif ($numargs == 1) {
    # The substitution edits $ARGV[0] in place, so the message and the
    # names built below all see the shortened base name.
    my $had_suffix = ($ARGV[0] =~ s/\.etx$//);
    if ($had_suffix) {
        print STDERR "warning: one argument form used but called with $ARGV[0].etx, argument shortened to $ARGV[0].\n";
    }

    $infile  = "$ARGV[0].etx";
    $outfile = "$ARGV[0].html";
}
else {
    print STDERR "usage: $0 infile.etx outfile.html\n";
    print STDERR "usage: $0 inoutfile\n";
    exit -1;
}

# Give a specific message for the two most common input problems before
# attempting to open the file.
if (! -e $infile) {
    print STDERR "error: Input, $infile, does not exist\n";
    exit -1;
}

if (! -r $infile) {
    print STDERR "error: Input, $infile, not readable\n";
    exit -1;
}

# Read the whole input into @data, one element per line, newlines removed.
# Three-argument open keeps characters in the file name from being taken
# as an open mode, and the low-precedence "or" makes the die apply to the
# result of open() rather than to the (always true) file name string.
open(my $in_fh, '<', $infile)
    or die "Could not open $infile for reading, $!";
my @data = <$in_fh>; # slurp input
chomp @data; # strip newlines
close $in_fh;

# OUTFILE stays a bareword handle because the output sections further
# down print to it by that name.
open(OUTFILE, '>', $outfile)
    or die "Could not open $outfile for writing, $!";

## Loop 1: Find any href-tt tags and hash the URL against a key.
## Also protect every doubled backtick by rewriting it as "+++"; the
## output stage turns "+++" back into a single backtick.
##
## An href definition has the form
##     .. _key URL
## and may be continued on directly following lines of the form
##     .. more-of-the-URL
## (the continuation text must not start with "_").  Definition and
## continuation lines are blanked in @data so later passes ignore them.
for my $loop (0 .. $#data)
{
    if(!$data[$loop]) { next; } # skip blank lines

    # href-tt finder: .. _href URL
    # The non-greedy capture plus \s*$ keeps trailing blanks out of the URL.
    if($data[$loop] =~ /^\.\.\s+_(\S*)\s+(.*?)\s*$/ ) {
        my $key = $1;
        my $value = $2;
        my $pos = $loop + 1;
        $data[$loop] = "";
        while ($pos <= $#data) {
            # lookahead: append the whole continuation text, not just its
            # first character
            if($data[$pos] =~ m/^\.\. ([^_].*?)\s*$/) {
                $value .= $1;
                $data[$pos] = "";
                $pos++;
            } else {
                last;
            }
        }
        # The value ends up inside href="...", so the double quote has to
        # be escaped along with the usual markup characters.
        $href{$key} = encode_entities($value, '<>&"');
    }
    $data[$loop] =~ s/``/+++/g;
}

my ($htmltitle, $htmlauthor, $htmldate) = ("","","");

## Loop 2: Find headers and flow paragraphs, etc. together
##
## Each non-empty element of @data is classified by its leading text and
## rewritten in place as an HTML fragment.  Multi-line constructs (lists,
## quotes, paragraphs, verbatim text) are folded into the element where
## they start and the lines they consumed are blanked.  Side effects:
##   $htmltitle / $htmlauthor / $htmldate  first Subject:/From:/Date: value
##                                         (the first heading also serves
##                                         as title when none is set yet)
##   @toc, $tocindex                       one "level:headline" per heading
for my $loop (0 .. $#data)
{
    if(!$data[$loop]) {next;} # skip blank lines
    $_ = $data[$loop];

    if(/^\.\. / ) {
        # suppress-tt
        $data[$loop] = "";
    } elsif (/^Subject: (.*$)/) {
        if($htmltitle eq "") {
            $htmltitle = style_encode($1);
        }
        $data[$loop] = "";
    } elsif (/^From: (.*$)/) {
        if($htmlauthor eq "") {
            $htmlauthor = $1;
        }
        $data[$loop] = "";
    } elsif (/^Date: (.*$)/) {
        if($htmldate eq "") {
            $htmldate = $1;
        }
        $data[$loop] = "";
    } elsif (/^(===|---)/) {
        # Heading underline: "===" marks level 1, "---" marks level 2.
        # The heading text is the line directly above.
        my $level = ($1 eq "===") ? 1 : 2;
        $data[$loop] = "";

        # An underline on the very first line has no title above it.
        # Index -1 would silently address the LAST line of the file, so
        # drop the underline instead of reading and overwriting that line.
        next if $loop == 0;

        my $headline = style_encode($data[$loop-1]);
        if($htmltitle eq "") {
            $htmltitle = $headline;
        }
        $tocindex++;
        my $id = "sh$tocindex";
        $toc[$tocindex] = "$level:$headline";

        $data[$loop-1] = "<h$level id=\"$id\">$headline</h$level>\n";
    } elsif (/^\s{0,2}\* /) {
        # this is a list
        $data[$loop] =~ s/^\s*\* //;
        $data[$loop] = "<ul><li>" . style_encode($data[$loop]);
        my $pos = $loop+1;
        while($pos <= $#data) {
            if($data[$pos] =~ s/^\s{0,2}\* //) {
                # next bullet
                $data[$loop] .= "</li>\n<li>" . style_encode($data[$pos]);
                $data[$pos] = "";
                $pos++;
            } elsif ($data[$pos] =~ s/^ ([^ *])/$1/) {
                # continuation of the current bullet
                $data[$loop] .= " " . style_encode($data[$pos]);
                $data[$pos] = "";
                $pos++;
            } else {
                last;
            }
        }
        $data[$loop] .= "</li></ul>\n";
    } elsif (/^\s{0,2}>\s*/) {
        # this is included text with a ">"
        $data[$loop] =~ s/^\s{0,2}>\s*//;
        $data[$loop] = style_encode($data[$loop]);
        my $pos = $loop+1;
        while($pos <= $#data) {
            last unless $data[$pos] =~ s/^\s{0,2}>\s*//;
            if($data[$pos] eq "") {
                # a bare ">" line is a break inside the quote
                $data[$loop] .= " <br />";
            } else {
                $data[$loop] .= " ". style_encode($data[$pos]);
            }
            # Advance in both cases; otherwise a bare ">" line would end
            # the quote right after the <br /> and split it in two.
            $data[$pos] = "";
            $pos++;
        }

        $data[$loop] = "<blockquote>" . $data[$loop] . "</blockquote>\n";
    } elsif (/^ [^ ]/) {
        # this is body text to wrap up
        $data[$loop] =~ s/^ //;
        $data[$loop] = style_encode($data[$loop]);
        my $pos = $loop+1;
        while($pos <= $#data) {
            if($data[$pos] =~ s/^ ([^ >])/$1/) {
                $data[$loop] .= " " . style_encode($data[$pos]);
                $data[$pos] ="";
                $pos++;
            } else {
                last;
            }
        }
        $data[$loop] = "<p>" . $data[$loop] . "</p>\n";
    } elsif (/`/) {
        # if nothing else and there is a ` assume multiline
        # verbatim environment
        $data[$loop] =~ s/`//;
        $data[$loop] = encode_only($data[$loop]);
        my $pos = $loop+1;
        while($pos<= $#data) {
            if($data[$pos] =~ s/`//) {
                # closing backtick: keep any text that shares its line
                if($data[$pos] ne '') {
                    $data[$loop] .= "\n". encode_only($data[$pos]);
                    $data[$pos] = "";
                }
                last;
            } else {
                $data[$loop] .= "\n" . encode_only($data[$pos]);
                $data[$pos] = "";
                $pos++;
            }
        }
        $data[$loop] = "<pre>" . $data[$loop] . "</pre>\n";
    } elsif (/\$\$/) {
        $data[$loop] = "<span class=\"twobuck-tt\">" . $data[$loop] . "</span>";
    } elsif (/^\s*$/) {
        $data[$loop] = ""; # effectively a blank line
    } else {
        # should only be here if at end or next line is the === or ---
        next if ($loop == $#data);
        next if ($data[$loop+1] =~ /^===/);
        next if ($data[$loop+1] =~ /^---/);

        carp "Unhandled typotag, line = $loop, \"$_\"\n";
    }
}

# Emit the document head, the embedded style sheet and the opening of the
# content container.
#
# $htmltitle may carry inline markup produced by style_encode (<strong>,
# <em>, <a>); <title> only holds text, so the tags are removed for it.
(my $plaintitle = $htmltitle) =~ s/<[^>]*>//g;

# Style sheet notes: "content" and "twobuck-tt" are class names on the
# generated <div>/<span>, so their selectors need the leading dot;
# declarations are separated by ";"; a font list belongs in font-family.
print OUTFILE qq|
<html>
<head><title>$plaintitle</title>
<style type="text/css">
body { margin: 0; padding: 0; font-family: Verdana, sans-serif; color: #000; background: #ffd; }
.content { padding: 10px 20px; margin: 5% 10%; border: 1px dotted #333; background: #fff; }
.twobuck-tt {color: #040; }
p { margin-top: 0; padding-top: 0; line-height: 1.5em; margin-bottom: 1em; }
h1 {font-size: 2em; }
h2 {font-size: 1.75em; }
h1, h2 { margin: 1.25em 0 0 0; font-family: Verdana, sans-serif; }
pre { font-family: Courier, "Courier New", monospace; }
a { text-decoration:none; font-weight:600; }
a:hover {background-color:#eee;}

</style>
</head>
<body>
<div class="content">
|;

# header information
# The title was already passed through style_encode; author and date are
# raw input text and are escaped here.
print OUTFILE "<h1>$htmltitle</h1>\n" if ($htmltitle);
print OUTFILE "<p>By ",encode_only($htmlauthor),"</p>\n" if ($htmlauthor);
print OUTFILE "<p>",encode_only($htmldate),"</p>\n" if ($htmldate);

# table of contents
# Each @toc entry is "level:headline" and links to the anchor "sh<index>"
# given to the matching heading.  Level-2 headings are currently skipped,
# so the two nesting branches below cannot be reached; they are kept so
# that removing the "next" is all it takes to list sub-headings.
print OUTFILE "<h1>Table of Contents</h1>\n";
print OUTFILE "<ul>\n";
my $lastlevel = 0;
for my $loop (0 .. $tocindex)
{
    # Split on the first ":" only; the headline may contain colons.
    my ($level, $content) = split /:/, $toc[$loop], 2;

    next if $level == 2;

    if($lastlevel == 0) {
        # first entry
        print OUTFILE qq|<li><a href="#sh$loop">$content</a>|;
        $lastlevel = $level;
    } elsif ($lastlevel == $level) {
        # stayed same, end prior entry start new one
        print OUTFILE qq|</li>\n<li><a href="#sh$loop">$content</a>|;
    } elsif ($level == 2 && $lastlevel == 1) {
        # starting new level
        print OUTFILE qq|\n<ul>\n<li><a href="#sh$loop">$content</a>|;
        $lastlevel = $level;
    } elsif ($level == 1 && $lastlevel == 2) {
        # finishing a level
        print OUTFILE qq|</li>\n</ul>\n</li>\n<li><a href="#sh$loop">$content</a>|;
        $lastlevel = $level;
    } else {
        print STDERR "toc error lastlevel vs level mismatch\n";
        die;
    }
}
# Close the last <li> only if one was opened; with no entries the list
# would otherwise contain a stray </li>.
print OUTFILE "</li>" if ($lastlevel != 0);
print OUTFILE "</ul>\n";
if ($lastlevel == 2) { print OUTFILE "</li></ul>\n"; }

# Emit the converted body.  Elements blanked by the earlier passes are
# skipped, and the "+++" placeholder written by the first pass is turned
# back into a literal backtick.  Iterating over the elements themselves
# avoids reading one slot past the end of @data.
for my $line (@data)
{
    if(!$line) {next;} # skip blank lines
    $line =~ s/\+\+\+/`/g;
    print OUTFILE "$line\n";
}

print OUTFILE qq|
</div>
</body></html>
|;

# Buffered write errors (disk full, quota, ...) only surface at close, so
# the result has to be checked for an output handle.
close(OUTFILE)
    or die "Could not close $outfile, $!";