#!/usr/bin/env perl
# txt2pre --- convert my site's txt files to `pre'-based atom/rss/html

# Copyright (C) 2014-2021 all contributors <[email protected]>
# Copyright (c) 2021 Amin Bandali <[email protected]>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

# This simple script borrows from a script of the same name from the
# wonderful public-inbox project, under AGPLv3+, with additions of
# my own.

# Update (2021-11-01): this script is no longer used for generating
# my site's pages, but it is kept for future reference.


use strict;
use warnings 'all';
use Getopt::Long;

# Command-line settings and their defaults.
my ($format, $lang) = ('html', 'en');
# Switches; each one stays '' (false) unless given on the command line.
my ($index, $header, $footer) = ('', '', '');

my @option_spec = (
   'format=s' => \$format,
   'lang=s'   => \$lang,
   'index'    => \$index,
   'header'   => \$header,
   'footer'   => \$footer,
);
unless (GetOptions(@option_spec)) {
   die("bad command line arguments\n");
}

# Per-language site metadata.  Every value stays the empty string when
# $lang is neither 'en' nor 'fa'.
my ($author, $site_title, $site_desc, $site_url, $feed_id) = ('') x 5;
if ($lang eq 'en') {
   $author     = 'bandali';
   $site_title = "${author}'s personal site";
   $site_desc  = "notes and blog posts by $author";
   $site_url   = 'https://bndl.org';
   $feed_id    = "tag:bndl.org,2020:notes.$format";
} elsif ($lang eq 'fa') {
   $author     = 'بندعلی';
   $site_title = "سایت شخصی $author";
   $site_desc  = "نوشته‌ها و بلاگ پست‌های $author";
   $site_url   = 'https://bndl.org/fa/';
   $feed_id    = "tag:bndl.org,2020:fa/notes.$format";
}

# Pattern used by linkify() to find URLs in text.
#   $1 - optional opening character ( ' or ! directly before the URL
#   $2 - the URL itself: one of the listed schemes, "://", a host part,
#        and optionally a path, a query string and a fragment
# Compiled with /x (layout whitespace in the pattern is ignored) and /i
# (case-insensitive).
my $link_re =
   qr{([\('!])?\b((?:ftps?|https?|nntps?|imaps?|s?news|gopher)://
      [\@:\w\.-]+(?:/
        (?:[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]*)
        (?:\?[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]+)?
        (?:\#[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?]+)?
      )?
   )}xi;

# Maps an opening character (as captured in $1 of $link_re) to a pattern
# that matches its closing counterpart at the very end of the URL,
# optionally followed by one of . , ; or +.  The matched tail is captured
# so the caller can move it outside the link.
my %pairs = (
   "(" => qr/(\)[\.,;\+]?)\z/, # Markdown (,), Ruby (+) (, for arrays)
   "'" => qr/('[\.,;\+]?)\z/, # Perl / Ruby
   "!" => qr/(![\.,;\+]?)\z/, # Perl / Ruby
);

# Entities for the characters html_esc() replaces.  The two quote entries
# are commented out, so quotes pass through unchanged.
my %html_map = (
   '<' => '&lt;',
   '>' => '&gt;',
   '&' => '&amp;',
   # '"' => '&quot;',
   # "'" => '&#39;',
);

# html_esc($s): return a copy of $s in which every &, < and > has been
# replaced by its entity from %html_map.
sub html_esc {
   my ($text) = @_;
   $text =~ s{([&<>])}{$html_map{$1}}g;
   return $text;
}

# linkify($s): return a copy of $s in which every URL matched by $link_re
# is wrapped in <a href="...">...</a>.  Punctuation and closing characters
# that trail a URL are kept outside the generated link.
sub linkify {
   my ($text) = @_;

   # Builds the replacement for a single match.  $lead is the optional
   # opening character found in front of the URL, $href the matched URL.
   my $anchor = sub {
       my ($lead, $href) = @_;
       $lead ||= '';
       my $trail = '';

       # URLs in running text are commonly followed by '.', ',' or ';'
       # that ends the sentence; treat such a character as belonging to
       # the sentence, not to the URL.
       my $closer = $pairs{$lead};
       if (defined $closer) {
           # opened with ( ' or !: strip the matching closer, if present
           $trail = $1 if $href =~ s/$closer//;
       } elsif ($href =~ s/(\))?([\.,;])\z//) {
           my ($paren, $punct) = ($1, $2);
           $trail = $punct;
           if (defined $paren) {
               # a ')' stays in the URL only when the URL has a '('
               if (index($href, '(') >= 0) {
                   $href .= ')';
               } else {
                   $trail = ")$trail";
               }
           }
       } elsif (index($href, '(') < 0 && $href =~ s/\)\z//) {
           $trail = ')';
       }

       return $lead . "<a href=\"$href\">$href</a>" . $trail;
   };

   $text =~ s{$link_re}{ $anchor->($1, $2) }ge;
   return $text;
}


my $out = '';

# atom/rss feed header and footer
#
# With --index and an atom or rss format, only the opening (--header) or
# closing (--footer) markup of the feed is produced; control then jumps
# straight to the PRINT label and no note is read from STDIN.
if ($index and ($format eq 'atom' or $format eq 'rss')) {
   if ($header) {
       my $now_iso8601 = `date -Iseconds -u | tr -d \\\\n`;
       my $now_rfc5322 = `date -uR | tr -d \\\\n`;
       # the feed being generated is "self", its sibling an "alternate"
       my $atom_rel = $format eq 'atom' ? 'self' : 'alternate';
       my $rss_rel = $format eq 'rss' ? 'self' : 'alternate';
       # An RSS channel already uses a plain <link> element for the site
       # URL, so relation links in RSS go in the atom: namespace that is
       # declared on the <rss> element below.
       my $link = $format eq 'atom' ? 'link' : 'atom:link';
       my $links = '';
       if ($lang eq 'en') {
           $links = qq(
<$link hreflang="fa" href="https://bndl.org/fa/notes.atom" rel="alternate" type="application/atom+xml" />
<$link hreflang="fa" href="https://bndl.org/fa/notes.rss" rel="alternate" type="application/rss+xml" />
<$link hreflang="fa" href="https://bndl.org/fa/bandali.fa.txt" rel="alternate" type="text/plain" />
<$link hreflang="fa" href="https://bndl.org/fa/" rel="alternate" type="text/html" />
<$link href="https://bndl.org/notes.atom" rel="$atom_rel" type="application/atom+xml" />
<$link href="https://bndl.org/notes.rss" rel="$rss_rel" type="application/rss+xml" />
<$link href="https://bndl.org/bandali.txt" rel="alternate" type="text/plain" />
<$link href="https://bndl.org" rel="alternate" type="text/html" />);
       } elsif ($lang eq 'fa') {
           # use $link here too, so the Persian RSS feed gets <atom:link>
           # elements exactly like the English one
           $links = qq(
<$link hreflang="en" href="https://bndl.org/notes.atom" rel="alternate" type="application/atom+xml" />
<$link hreflang="en" href="https://bndl.org/notes.rss" rel="alternate" type="application/rss+xml" />
<$link hreflang="en" href="https://bndl.org/bandali.txt" rel="alternate" type="text/plain" />
<$link hreflang="en" href="https://bndl.org" rel="alternate" type="text/html" />
<$link href="https://bndl.org/fa/notes.atom" rel="$atom_rel" type="application/atom+xml" />
<$link href="https://bndl.org/fa/notes.rss" rel="$rss_rel" type="application/rss+xml" />
<$link href="https://bndl.org/fa/bandali.fa.txt" rel="alternate" type="text/plain" />
<$link href="https://bndl.org/fa/" rel="alternate" type="text/html" />);
       }
       # drop the newline that follows the opening qq( delimiter
       $links =~ s/^\n//;

       $out .= '<?xml version="1.0" encoding="UTF-8" ?>';
       $out .= ($format eq 'atom') ? qq(
<feed xml:lang="$lang" xmlns="http://www.w3.org/2005/Atom">
<title>$site_title</title>
<subtitle>$site_desc</subtitle>
<id>$feed_id</id>
$links
<updated>$now_iso8601</updated>)
           : ($format eq 'rss') ? qq(
<rss version="2.0"
    xmlns:atom="http://www.w3.org/2005/Atom"
    xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>$site_title</title>
<description>$site_desc</description>
<link>$site_url</link>
<language>$lang</language>
<lastBuildDate>$now_rfc5322</lastBuildDate>
<pubDate>$now_rfc5322</pubDate>
<ttl>1800</ttl>
$links)
           : '';
   } elsif ($footer) {
       $out .= ($format eq 'atom') ? '</feed>'
           : ($format eq 'rss') ? '</channel></rss>'
           : '';
   }

   # we're done
   goto PRINT;
}


# _date_output(@args): run date(1) with @args and return what it prints,
# with all newlines removed.  The arguments are handed over as a list, so
# text taken from the note is never interpreted by a shell.  Returns ''
# when date cannot be started or prints nothing (e.g. unparsable date).
sub _date_output {
   my @args = @_;
   my $pid = open(my $fh, '-|', 'date', @args);
   if (!$pid) {
       warn("cannot run date: $!\n");
       return '';
   }
   my $stamp = do { local $/; <$fh> };
   close($fh); # a failing exit status simply leaves $stamp empty
   $stamp = '' if !defined $stamp;
   $stamp =~ tr/\n//d;
   return $stamp;
}

# Read the whole note from standard input.
my $txt = do { local $/; <STDIN> };
$txt = '' if !defined $txt;

# The title is the note's first line, with the author's name appended
# unless the line already contains it.  index() is used instead of
# /$author/ because an empty $author would turn that into the empty
# pattern, which re-uses the last successful pattern.
my ($first_line) = $txt =~ /\A([^\n]+)/;
my $title = html_esc(defined $first_line ? $first_line : '');
$title =~ s/^\s+|\s+$//g;
$title .= " &#8212; $author" if index($title, $author) < 0;

# The last three lines of a note carry its "updated", "published" and
# "plain text" fields.  [^\r\n]* (rather than .*) keeps a CR of a CRLF
# line ending out of the captured values.
my ($upd, $pub, $url) =
   $txt =~ /([^\r\n]*)\r?\n([^\r\n]*)\r?\n([^\r\n]*)\r?\n?\z/;
($upd) = $upd =~ /(?:updated|ویرایش): (.*)/ if $upd;
($pub) = $pub =~ /(?:published|انتشار): (.*)/ if $pub;
$upd = $pub if (!$upd);
($url) = $url =~ /(?:plain text|متن ساده): (.*)/ if $url;
$url = 'https://bndl.org/bandali-cv.txt'
   if (!$url and $title =~ /curriculum vitae/);
$url = html_esc($url) if $url;

$txt = linkify(html_esc($txt));


# Timestamps in the forms the feeds need; each stays undef when the note
# has no corresponding date.  Declared without a statement modifier,
# because the effect of "my $x = ... if COND" is undefined.
my $upd_iso8601 =
   $upd ? _date_output('-Iseconds', '-u', '-d', $upd) : undef;
my $pub_iso8601 =
   $pub ? _date_output('-Iseconds', '-u', '-d', $pub) : undef;
my $pub_rfc5322 =
   $pub ? _date_output('-u', '-R', '-d', $pub) : undef;

# URL of the HTML rendering: ".txt" (or ".$lang.txt") becomes ".html" and
# a "bandali-" prefix on the file name is dropped.
my $url_html;
if ($url) {
   ($url_html = $url) =~ s/(?:[.]\Q$lang\E)?[.]txt$/.html/;
}
$url_html =~ s|/bandali-(.*)|/$1| if $url_html;

# The slug is the HTML file name without directory and extension; it
# makes the entry id unique within the feed.
my $slug = $url_html ? ($url_html =~ s|.*/(.*)[.]html$|$1|r) : undef;
my $note_id = $url_html ? "$feed_id:$slug" : undef;

# note header
#
# Appends to $out everything that precedes the note's text: the HTML
# document head for 'html', or the opening part of an <entry>/<item> for
# 'atom'/'rss'.  Both leave a <pre> element open for the note body.
if ($format eq 'html') {
   # Persian pages are laid out right-to-left.  The attribute is computed
   # separately so that the closing '>' of <html> is emitted for every
   # language, not only when the attribute is absent.
   my $dir = $lang eq 'fa' ? ' dir="rtl"' : '';
   $out .=
       '<!doctype html>'
       . qq(<html lang="$lang"$dir>)
       . qq(<head>
<meta http-equiv="Content-Type"
content="text/html; charset=utf-8" />\n)
       . "<title>$title</title>\n"
       . qq(<link rel="icon" href="data:,">\n)
       . ($url
          ? qq(<link rel="alternate" href="$url"
title="plain text" type="text/plain" />\n)
          : '')
       # the index page also links to its counterpart in the other language
       . (($index and $lang eq 'en')
          ? qq(<link rel="alternate" href="https://bndl.org/fa/"
hreflang="fa" title="persian" />\n)
          : ($index and $lang eq 'fa')
          ? qq(<link rel="alternate" href="https://bndl.org/"
hreflang="en" title="english" />\n)
          : '')
       . qq(<style>\@media(prefers-color-scheme:dark){
body{background:#1c1c1c;color:white;}a:link{color:#acdeff;}
a:visited{color:#f8f;}a:active{color:#e00;}})
       . ($lang eq 'fa'
          ? qq(\n\@font-face{font-family:sahel;font-weight:normal;
src:local('Sahel WOL'),local('Sahel'),
url('sahel.woff2')format('woff2');}pre{font-family:sahel})
          : '')
       . "</style>\n"
       . '</head><body><pre>';
} elsif ($format eq 'atom' or $format eq 'rss') {
   # in RSS the "updated" element is borrowed from the atom: namespace
   my $atom_updated = ($format eq 'atom') ? 'updated' : 'atom:updated';
   # empty when the note carries no date at all
   my $updated = $upd
       ? "<$atom_updated>$upd_iso8601</$atom_updated>\n"
       : '';
   if ($format eq 'atom') {
       $out .= qq(
<entry xml:base="$site_url">
<author><name>$author</name></author>
<id>$note_id</id>
<published>$pub_iso8601</published>\n)
           . $updated
           . qq(<link href="$url" rel="alternate" type="text/plain" />
<link href="$url_html" rel="alternate" type="text/html" />
<title>$title</title>
<content type="html"><![CDATA[<pre>);
   } else {
       $out .= qq(
<item>
<title>$title</title>
<link>$url_html</link>
<guid isPermaLink="false">$note_id</guid>
<pubDate>$pub_rfc5322</pubDate>\n)
           # only mention an update that differs from the publication date
           . (($updated and $pub ne $upd) ? $updated : '')
           . qq(<content:encoded><![CDATA[<pre>);
   }
}
# note body
$out .= $txt;

# note footer: close what the note header opened for this format; any
# other format gets no footer
my %closing_markup = (
   html => '</pre></body></html>',
   atom => "</pre>]]></content></entry>",
   rss  => "</pre>]]></content:encoded></item>",
);
$out .= $closing_markup{$format} if exists $closing_markup{$format};

# The feed header/footer code jumps here directly, skipping the note.
PRINT:
print $out, "\n";
STDOUT->flush();