/*
*                          Copyright (c) 2001
*                             Megan Gentry
*                          All Rights Reserved
*                  Commercial Distribution Prohibited
*
* This software may be  freely copied  and used in its entirety for any
* purpose  so long as the above copyright notice and these comments are
* preserved  in  the  source  form  of  this  software, and  the binary
* copyright is preserved in any image built from it.
*
* The author has used best efforts in the research, design, development
* and  testing of  this software.  The author  makes no warranty of any
* kind,  expressed or  implied,  with  regard to  this software and its
* suitability for a given application.  The author shall not  be liable
* in any  event for  incidental or  consequential damages in connection
* with, or arising out of, the use or performance of this software. Use
* of this software constitutes acceptance of these terms.
*
* The author  is committed to making a best effort at fixing any errors
* found  in the  software and  would welcome  any reports  of problems,
* comments  or suggestions  regarding the software.   Please send email
* to <[email protected]>.
*/


/*
* Abstract and Edit History
*
* dmextract
*      This program was designed to read the contents of a DECmail-11
*      .MAI file (an indexed file used on RSX and RSTS machines) and
*      to extract the messages so that they can then be read using
*      standard U*x mail utilities.  Input is from the specified file,
*      while output is to standard output so that it can either be
*      filtered or redirected to an output file.
*
* Edit History:
*
* (000) 07-Dec-2001 Megan Gentry
*      Final coding so that the program dumps most of my message
*      files.  The contents of the file it writes are still not
*      properly processed by U*x mail(x) programs -- they concatenate
*      some messages (even though all messages are written in
*      exactly the same sequence... maybe there is a problem with
*      header or message content).
*
* (001) 07-Dec-2001 Megan Gentry
*      Okay, the 'From' line appears to need more than one field
*      on the line for the message to be properly recognized.  A
*      temporary workaround has been to output something else (like
*      the date) for those 'From' lines with only one field.
*      Now the problem is that some lines (notably those from
*      VMS hosts) have names or witty sayings within quotes in
*      the from field, which display improperly using U*x mail.
*
*/


#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Size, in bytes, of the shared record buffer declared further down */
#define BLOCK   (512)

/*
* Copyright string compiled into the program image (the license text at
* the top of this file asks that the binary copyright be preserved).
*/
char copyright[] = "dmextract.c, Copyright (c) 2001 by Megan Gentry";

/* Shorthand names for the unsigned types used throughout this file */
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned long ulong;

/* Debug flag, initially off; not referenced by the code in this file */
int debug = 0;

/*
* Message descriptor structure
*
* One descriptor is allocated for every message header found in the
* mail file.  Descriptors are kept on a doubly linked list in the order
* in which the headers were found.  The file positions are filled in by
* the scanning pass in main(), the header strings by the header pass,
* and the line count by the line-counting pass.
*/

struct msginfo {
   struct msginfo *next;               /* Pointer to next record */
   struct msginfo *prev;               /* Pointer to previous record */
                                       /*   (set but never followed) */
   ulong number;                       /* Message number, counted from 1 */
   struct {
       ulong   spos;                   /* File position of header start marker */
       ulong   epos;                   /* File position of header end marker */
       char    from[256];              /* From record from header */
       char    date[256];              /* Date record from header */
       char    to[256];                /* To record from header */
       char    cc[256];                /* Cc record from header */
       char    subject[256];           /* Subj record from header */
       char    mid[256];               /* Undocumented record (type 0231); */
                                       /*   used as date if no date is set */
   } header;
   struct {
       ulong   spos;                   /* File position of text start marker */
       ulong   epos;                   /* File position of text end marker */
       ulong   lines;                  /* Count of non-empty text records */
   } text;
};

/*
* Message descriptor list head structure
*
* Anchors the doubly linked list of message descriptors and keeps a
* running count of how many descriptors have been linked in.
*/

struct msghead {
   struct msginfo *head;               /* First descriptor, NULL if list empty */
   struct msginfo *tail;               /* Last descriptor linked in */
   ulong count;                        /* Number of descriptors on the list */
};

struct msghead msglist;                 /* Head of message descriptor list */

char cuspname[132];                     /* Execution name of this program */
                                        /*   (copied from argv[0]) */
char mailfile[132];                     /* Name of mailfile to be referenced */

/* Scratch buffer that header and text records are read into */
uchar buffer[BLOCK];


/*
* getushort
*      Read one 16-bit unsigned word, stored low byte first, from the
*      current position of a stream.
*
*      fp      Stream to read from.
*
*      Returns hi * 256 + lo.  Any byte that could not be read (end of
*      file or read error) is taken as zero, so a read at end of file
*      returns 0 instead of a value built from uninitialised memory.
*/
unsigned short
getushort(FILE *fp)
{
   unsigned char sbuf[2] = { 0, 0 };
   size_t got;
   unsigned lo, hi;

   /*
    * Read byte-wise (size 1, count 2) so that after a short read the
    * bytes that did arrive are still well defined.
    */
   got = fread(sbuf, 1, sizeof sbuf, fp);

   lo = got > 0 ? sbuf[0] : 0;
   hi = got > 1 ? sbuf[1] : 0;
   return (unsigned short) (hi * 0400u + lo);
}


main (argc, argv)
   int argc;
   char *argv[];
{
   FILE *fopen(), *fp;
   int i, j, n;
   int reclen;
   char ch;
   uchar c1, c2, c3, c4;
   int hdrfg, hdrct;
   int txtfg, txtct, prtfg;
   ushort *wordp;
   struct msginfo *msgp;

   /* Save the cusp name */
   strcpy (cuspname, argv[0]);

   /* Assume the mail.mai file */
   strcpy (mailfile, "mail.mai");

   if (argc == 2)
       strcpy (mailfile, argv[1]);
   else if (argc > 2) {
       fprintf (stderr, "usage: %s [mail_file]\n", cuspname);
       exit (1);
   }

   /*
    * In case file was specified without ".mai" filetype, append it
    */
   if (rindex(mailfile,'.') == NULL) {
       strcat (mailfile, ".mai");
       fprintf (stderr, "warning (0): using file %s\n", mailfile);
   }

   /* open the mail file */
   if ((fp = fopen(mailfile, "rb")) == NULL) {
       fprintf (stderr, "%s: file not found %s\n", cuspname, mailfile);
       exit (1);
   }


   /*
    * In this first pass, we scan the file to find the start and end
    * of all the message headers and text blocks in the file and build
    * a linked list of messages with that information.
    */
   bzero (msglist, sizeof(struct msghead));

   hdrfg = hdrct = 0;
   txtfg = txtct = 0;

   /* Preload the character pipe-line */
   c1 = getc (fp) & 0377;
   c2 = getc (fp) & 0377;
   c3 = getc (fp) & 0377;

   do {
       /* Get the next character from the file */
       c4 = getc (fp) & 0377;

       /*
        * See if the current four characters delineate a
        * message header
        */
       if ( ((c2 * 0400) + c1) == 01 && ((c4 * 0400) + c3) == 0377) {
           if (hdrfg == 0) {
               /* We've located the start of a message header */
               hdrfg = 1;
               hdrct++;

               if (txtfg) {
                   fprintf (stderr, "warning (1): message header found while processing message %d text block\n", txtct);
                   txtfg = 0;
               }

               /* Allocate a new message descriptor */
               msgp = (struct msginfo *) malloc (sizeof(struct msginfo));
               if (!msgp) {
                   fprintf (stderr, "fatal (1): unable to allocate memory for message %d\n", hdrct);
                   exit (1);
               }

               /* Initialize descriptor */
               bzero (msgp, sizeof(struct msginfo));
               msgp->number = hdrct;
               msgp->header.spos = ftell(fp)-4;

               /* Link the new descriptor into the list */
               if (msglist.head) {
                   msgp->prev = msglist.tail;
                   (msglist.tail)->next = msgp;
                   msglist.tail = msgp;
               } else {
                   msglist.head = msgp;
                   msglist.tail = msgp;
               }

               /* Keep track of number of messages */
               msglist.count++;
           } else {
               /* We've located the end of the message header */
               msgp->header.epos = ftell(fp)-4;
               hdrfg = 0;
           }
       }

       /*
        * See if the current four characters delineate a
        * text block
        */
       if ( ((c2 * 0400) + c1) == 01 && ((c4 * 0400) + c3) == 0204) {
           if (txtfg == 0) {
               /* We've located the beginning of the message */
               txtfg = 1;
               txtct++;
               msgp->text.spos = ftell(fp)-4;
           } else {
               /* We've located the end of the message */
               msgp->text.epos = ftell(fp)-4;
               txtfg = 0;
           }
       }

       /* The pipe-line shifts */
       c1 = c2;
       c2 = c3;
       c3 = c4;
   } while (!feof(fp));

   /*
    * Verify that we have the same number of headers as text blocks
    */
   if (hdrct != txtct) {
       fprintf (stderr, "warning (1): %d headers, %d text blocks\n",
                       hdrct, txtct);
   }

   fprintf (stderr, "info (1): %s appears to contain %d %s\n",
               mailfile, msglist.count,
               msglist.count == 1 ? "message" : "messages");


   /*
    * In this pass, we extract message header information to fill
    * each entry in our message descriptor list.
    */
   for (msgp = msglist.head; msgp != NULL; msgp=msgp->next) {
       fseek (fp, msgp->header.spos+4, SEEK_SET);
       while (ftell(fp) < msgp->header.epos
                       && (reclen = getushort(fp)) != 01) {
           fread (buffer, ((reclen+1)&~1), 1, fp);
           reclen = reclen <= 255 ? reclen : 255;
           buffer[reclen] = 0;
           switch ((uchar)buffer[0]) {
             case 0231:                /* Undocumented field type */
                                       /* Appears to duplicate date */
               strcpy (&msgp->header.mid[0], &buffer[1]);
               if (strlen(&msgp->header.date[0]) == 0)
                   strcpy (&msgp->header.date[0], &msgp->header.mid[0]);
               break;
             case 0201:                /* From: field */
               strcpy (&msgp->header.from[0], &buffer[1]);
               break;
             case 0202:                /* Date: field */
               strcpy (&msgp->header.date[0], &buffer[1]);
               break;
             case 0205:                /* To: field */
               strcpy (&msgp->header.to[0], &buffer[1]);
               break;
             case 0206:                /* Cc: field */
               strcpy (&msgp->header.cc[0], &buffer[1]);
               break;
             case 0207:                /* Subject: field */
               strcpy (&msgp->header.subject[0], &buffer[1]);
               break;
           }
       }
   }


   /*
    * In this pass, we process the counted records comprising the
    * message to 1) count the number of lines of text so we can
    * report it, and 2) so that we can ensure that processing
    * continues to the end of a message block.
    */

   /*
    * NOTE:
    *  It appears that different versions of DECmail did different
    *  things with regard to the text records.  I believe they are
    *  are supposed to be counted records, two bytes count, with
    *  n bytes of text, padded with null bytes to account for
    *  odd counts.  The problem is that this is not what I found in
    *  practice in all cases.  And in some cases, the counts are
    *  fine up to some point, and then they are simply wrong.
    *  So, combining the algorithm I use for determining line length
    *  with a recovery algorithm when the next line's record count
    *  looks wrong, this seems to work best.  At the worst, I've
    *  noticed that sometimes, the final character of a line will
    *  be lost for all lines of a message, but not all lines of
    *  all messages in that same file.
    */
   for (msgp = msglist.head; msgp != NULL; msgp=msgp->next) {
       int linct = 0;
       fseek (fp, msgp->text.spos+4, SEEK_SET);
       while (ftell(fp) < msgp->text.epos) {
           reclen = getushort(fp);
           linct++;
           if (reclen > 255) {
               fprintf (stderr, "warning (3): Message %04d, Line %04d, filpos 0%012lo, reclen %05d\n",
                       msgp->number, linct, ftell(fp)-2, reclen);
               fprintf (stderr, "info (3): adjusting file position\n");
               fseek (fp, ftell(fp)-1, SEEK_SET);
               --linct;
               continue;
           }
           if (reclen == 0) continue;
           if (reclen & 01)
               fread (buffer, reclen, 1, fp);
           else
               fread (buffer, reclen-1, 1, fp);
           msgp->text.lines++;
       }
       if (ftell(fp) == msgp->text.epos) continue;
       fprintf (stderr, "error (3): While processing message %d:\n",
                       msgp->number);
       fprintf (stderr, "  Processing ended at file position 0%012lo\n",
                       ftell(fp));
       fprintf (stderr, "   Text block ends at file position 0%012lo\n",
                       msgp->text.epos);
   }


   /*
    * In this pass, we finally start outputting the information
    * obtained from the headers, followed by the associated
    * message text.  Output is done in such a way that it can
    * be read by U*x standard mail utilities (hopefully).
    */
   for (msgp = msglist.head; msgp != NULL; msgp=msgp->next) {

       /*
        * Output the 'From' field.  If there is only one
        * field in the text, add the date as a second field
        * on the line so that U*x mail utilities can properly
        * identify starts of messages
        */
       if (index(msgp->header.from,' '))
           printf ("From %s\n", &msgp->header.from[0]);
       else
           printf ("From %s %s\n",
                       &msgp->header.from[0],
                       &msgp->header.date[0]);

       printf ("From: %s\n", &msgp->header.from[0]);

       /* There should always be a recipient specified */
       printf ("To: %s\n", &msgp->header.to[0]);

       /* There doesn't always have to be a CC list */
       if (strlen(&msgp->header.cc[0]))
           printf ("Cc: %s\n", &msgp->header.cc[0]);

       /* There always has to be a date */
       printf ("Date: %s\n", &msgp->header.date[0]);

       /* There doesn't always have to be a subject */
       if (strlen(&msgp->header.subject[0]))
           printf ("Subject: %s\n", &msgp->header.subject[0]);

       /* Let's assume the mail files were read at some point */
       printf ("Status: RO\n");

       /* This is dmextract-specific info for debugging */
       printf ("DECmail-Info: %s\n", &msgp->header.mid[0]);
       printf ("DMextract-Number: %d\n", msgp->number);
       printf ("DMline-Count: %d\n", msgp->text.lines);

       /* Separate header from text with a blank line */
       printf ("\n");

       /* Now we start outputting the message text */
       fseek (fp, msgp->text.spos+4, SEEK_SET);
       while (ftell(fp) < msgp->text.epos) {
           reclen = getushort(fp);
           if (reclen > 255) {
               fseek (fp, ftell(fp)-1, SEEK_SET);
               continue;
           }
           if (reclen == 0) continue;
           if (reclen & 01)
               fread (buffer, reclen, 1, fp);
           else
               fread (buffer, reclen-1, 1, fp);
           buffer[reclen] = 0;
           if (strncmp(&buffer[0],"From",4) == 0)
               printf (">");
           for (i = 0; i < reclen-1; i++) {
               if (buffer[i] == 012) break;
               if (buffer[i] == 015) {
                   (void) getc (fp);
                   continue;
               }
               printf ("%c", buffer[i]);
           }
           printf ("\n");
       }
       printf ("\n");
   }
}