/* Copyright (c) 2001 */

/*
* Copyright (c) 2001
* Megan Gentry
* All Rights Reserved
* Commercial Distribution Prohibited
*
* This software may be freely copied and used in its entirety for any
* purpose so long as the above copyright notice and these comments are
* preserved in the source form of this software, and the binary
* copyright is preserved in any image built from it.
*
* The author has used best efforts in the research, design, development
* and testing of this software. The author makes no warranty of any
* kind, expressed or implied, with regard to this software and its
* suitability for a given application. The author shall not be liable
* in any event for incidental or consequential damages in connection
* with, or arising out of, the use or performance of this software. Use
* of this software constitutes acceptance of these terms.
*
* The author is committed to making a best effort at fixing any errors
* found in the software and would welcome any reports of problems,
* comments or suggestions regarding the software. Please send email
* to <[email protected]>.
*/

/*
* Abstract and Edit History
*
* dmextract
* This program was designed to read the contents of a DECmail-11
* .MAI file (an indexed file used on RSX and RSTS machines) and
* to extract the messages so that they can then be read using
* standard U*x mail utilities. Input is from the specified file,
* while output is to standard output so that it can either be
* filtered or redirected to an output file.
*
* Edit History:
*
* (000) 07-Dec-2001 Megan Gentry
* Final coding so that the program dumps most of my message
* files. The contents of the file it writes are still not
* properly processed by U*x mail(x) programs -- they concatenate
* some messages (even though all messages are written in
* exactly the same sequence... maybe there is a problem with
* header or message content).
*
* (001) 07-Dec-2001 Megan Gentry
* Okay, the 'From' line appears to need more than one field
* on the line for the message to be properly recognized. A
* temporary workaround has been to output something else (like
* the date) for those 'From' lines with only one field.
* Now the problem is that some lines (notably those from
* VMS hosts) have names or witty sayings within quotes in
* the from field, which display improperly using U*x mail.
*
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Size, in bytes, of the global scratch record buffer declared below. */
#define BLOCK (512)

/*
* Copyright string compiled into the image; the license text at the top
* of this file asks that the binary copyright be preserved in any build.
*/
char copyright[] = "dmextract.c, Copyright (c) 2001 by Megan Gentry";

/* Short names for the unsigned types used for raw bytes and file offsets. */
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned long ulong;

/* Debug flag; initialised to 0 and not referenced by the code in this file as shown. */
int debug = 0;

/*
* Message descriptor structure
*
* One descriptor is allocated for every message header found while
* scanning the mail file. The descriptors form a doubly linked list
* anchored in the message list head. File positions are byte offsets
* as reported by ftell(); the start/end positions point at the first
* byte of the four-byte marker that opens/closes the block.
*/

struct msginfo {
struct msginfo *next; /* Pointer to next record */
struct msginfo *prev; /* Pointer to previous record */
/* not really needed */
ulong number; /* Message number (counted from 1 in file order) */
struct {
ulong spos; /* File position of the marker that starts the header */
ulong epos; /* File position of the marker that ends the header */
char from[256]; /* From record from header (field type 0201) */
char date[256]; /* Date record from header (field type 0202) */
char to[256]; /* To record from header (field type 0205) */
char cc[256]; /* Cc record from header (field type 0206) */
char subject[256]; /* Subj record from header (field type 0207) */
char mid[256]; /* Undocumented field type 0231; appears to */
/* duplicate the date and is used as the date */
/* when no date has been stored yet */
} header;
struct {
ulong spos; /* File position of the marker that starts the text */
ulong epos; /* File position of the marker that ends the text */
ulong lines; /* Count of lines (text records) in the message */
} text;
};

/*
* Message descriptor list head structure
*
* Anchors the doubly linked list of message descriptors. New
* descriptors are appended at the tail, so walking from head follows
* the order in which the message headers were found in the file.
*/

struct msghead {
struct msginfo *head; /* First descriptor, or NULL when the list is empty */
struct msginfo *tail; /* Last descriptor; new entries are linked after it */
ulong count; /* Number of descriptors in the list */
};

struct msghead msglist; /* Head of message descriptor list */

char cuspname[132]; /* Execution name of this program (from argv[0]) */
char mailfile[132]; /* Name of mailfile to be referenced */

/* Scratch buffer holding the header or text record currently being read */
uchar buffer[BLOCK];

/*
 * getushort
 *      Read one 16-bit word from fp, stored low byte first.
 *
 * Parameters:
 *      fp      stream to read from, positioned at the word
 *
 * Returns the value of the word. When fewer than two bytes can be read
 * (end of file or read error) the function returns 0 instead of a value
 * built from uninitialised storage; any byte that was available has
 * still been consumed from the stream.
 */
unsigned short
getushort(FILE *fp)
{
    unsigned char bytes[2] = { 0, 0 };

    if (fread(bytes, 1, sizeof bytes, fp) != sizeof bytes)
        return 0;

    /* bytes[0] is the low-order byte, bytes[1] the high-order byte */
    return (unsigned short)(((unsigned int)bytes[1] << 8) | bytes[0]);
}

main (argc, argv)
int argc;
char *argv[];
{
FILE *fopen(), *fp;
int i, j, n;
int reclen;
char ch;
uchar c1, c2, c3, c4;
int hdrfg, hdrct;
int txtfg, txtct, prtfg;
ushort *wordp;
struct msginfo *msgp;

/* Save the cusp name */
strcpy (cuspname, argv[0]);

/* Assume the mail.mai file */
strcpy (mailfile, "mail.mai");

if (argc == 2)
strcpy (mailfile, argv[1]);
else if (argc > 2) {
fprintf (stderr, "usage: %s [mail_file]\n", cuspname);
exit (1);
}

/*
* In case file was specified without ".mai" filetype, append it
*/
if (rindex(mailfile,'.') == NULL) {
strcat (mailfile, ".mai");
fprintf (stderr, "warning (0): using file %s\n", mailfile);
}

/* open the mail file */
if ((fp = fopen(mailfile, "rb")) == NULL) {
fprintf (stderr, "%s: file not found %s\n", cuspname, mailfile);
exit (1);
}

/*
* In this first pass, we scan the file to find the start and end
* of all the message headers and text blocks in the file and build
* a linked list of messages with that information.
*/
bzero (msglist, sizeof(struct msghead));

hdrfg = hdrct = 0;
txtfg = txtct = 0;

/* Preload the character pipe-line */
c1 = getc (fp) & 0377;
c2 = getc (fp) & 0377;
c3 = getc (fp) & 0377;

do {
/* Get the next character from the file */
c4 = getc (fp) & 0377;

/*
* See if the current four characters delineate a
* message header
*/
if ( ((c2 * 0400) + c1) == 01 && ((c4 * 0400) + c3) == 0377) {
if (hdrfg == 0) {
/* We've located the start of a message header */
hdrfg = 1;
hdrct++;

if (txtfg) {
fprintf (stderr, "warning (1): message header found while processing message %d text block\n", txtct);
txtfg = 0;
}

/* Allocate a new message descriptor */
msgp = (struct msginfo *) malloc (sizeof(struct msginfo));
if (!msgp) {
fprintf (stderr, "fatal (1): unable to allocate memory for message %d\n", hdrct);
exit (1);
}

/* Initialize descriptor */
bzero (msgp, sizeof(struct msginfo));
msgp->number = hdrct;
msgp->header.spos = ftell(fp)-4;

/* Link the new descriptor into the list */
if (msglist.head) {
msgp->prev = msglist.tail;
(msglist.tail)->next = msgp;
msglist.tail = msgp;
} else {
msglist.head = msgp;
msglist.tail = msgp;
}

/* Keep track of number of messages */
msglist.count++;
} else {
/* We've located the end of the message header */
msgp->header.epos = ftell(fp)-4;
hdrfg = 0;
}
}

/*
* See if the current four characters delineate a
* text block
*/
if ( ((c2 * 0400) + c1) == 01 && ((c4 * 0400) + c3) == 0204) {
if (txtfg == 0) {
/* We've located the beginning of the message */
txtfg = 1;
txtct++;
msgp->text.spos = ftell(fp)-4;
} else {
/* We've located the end of the message */
msgp->text.epos = ftell(fp)-4;
txtfg = 0;
}
}

/* The pipe-line shifts */
c1 = c2;
c2 = c3;
c3 = c4;
} while (!feof(fp));

/*
* Verify that we have the same number of headers as text blocks
*/
if (hdrct != txtct) {
fprintf (stderr, "warning (1): %d headers, %d text blocks\n",
hdrct, txtct);
}

fprintf (stderr, "info (1): %s appears to contain %d %s\n",
mailfile, msglist.count,
msglist.count == 1 ? "message" : "messages");

/*
* In this pass, we extract message header information to fill
* each entry in our message descriptor list.
*/
for (msgp = msglist.head; msgp != NULL; msgp=msgp->next) {
fseek (fp, msgp->header.spos+4, SEEK_SET);
while (ftell(fp) < msgp->header.epos
&& (reclen = getushort(fp)) != 01) {
fread (buffer, ((reclen+1)&~1), 1, fp);
reclen = reclen <= 255 ? reclen : 255;
buffer[reclen] = 0;
switch ((uchar)buffer[0]) {
case 0231: /* Undocumented field type */
/* Appears to duplicate date */
strcpy (&msgp->header.mid[0], &buffer[1]);
if (strlen(&msgp->header.date[0]) == 0)
strcpy (&msgp->header.date[0], &msgp->header.mid[0]);
break;
case 0201: /* From: field */
strcpy (&msgp->header.from[0], &buffer[1]);
break;
case 0202: /* Date: field */
strcpy (&msgp->header.date[0], &buffer[1]);
break;
case 0205: /* To: field */
strcpy (&msgp->header.to[0], &buffer[1]);
break;
case 0206: /* Cc: field */
strcpy (&msgp->header.cc[0], &buffer[1]);
break;
case 0207: /* Subject: field */
strcpy (&msgp->header.subject[0], &buffer[1]);
break;
}
}
}

/*
* In this pass, we process the counted records comprising the
* message to 1) count the number of lines of text so we can
* report it, and 2) so that we can ensure that processing
* continues to the end of a message block.
*/

/*
* NOTE:
* It appears that different versions of DECmail did different
* things with regard to the text records. I believe they are
* are supposed to be counted records, two bytes count, with
* n bytes of text, padded with null bytes to account for
* odd counts. The problem is that this is not what I found in
* practice in all cases. And in some cases, the counts are
* fine up to some point, and then they are simply wrong.
* So, combining the algorithm I use for determining line length
* with a recovery algorithm when the next line's record count
* looks wrong, this seems to work best. At the worst, I've
* noticed that sometimes, the final character of a line will
* be lost for all lines of a message, but not all lines of
* all messages in that same file.
*/
for (msgp = msglist.head; msgp != NULL; msgp=msgp->next) {
int linct = 0;
fseek (fp, msgp->text.spos+4, SEEK_SET);
while (ftell(fp) < msgp->text.epos) {
reclen = getushort(fp);
linct++;
if (reclen > 255) {
fprintf (stderr, "warning (3): Message %04d, Line %04d, filpos 0%012lo, reclen %05d\n",
msgp->number, linct, ftell(fp)-2, reclen);
fprintf (stderr, "info (3): adjusting file position\n");
fseek (fp, ftell(fp)-1, SEEK_SET);
--linct;
continue;
}
if (reclen == 0) continue;
if (reclen & 01)
fread (buffer, reclen, 1, fp);
else
fread (buffer, reclen-1, 1, fp);
msgp->text.lines++;
}
if (ftell(fp) == msgp->text.epos) continue;
fprintf (stderr, "error (3): While processing message %d:\n",
msgp->number);
fprintf (stderr, " Processing ended at file position 0%012lo\n",
ftell(fp));
fprintf (stderr, " Text block ends at file position 0%012lo\n",
msgp->text.epos);
}

/*
* In this pass, we finally start outputting the information
* obtained from the headers, followed by the associated
* message text. Output is done in such a way that it can
* be read by U*x standard mail utilities (hopefully).
*/
for (msgp = msglist.head; msgp != NULL; msgp=msgp->next) {

/*
* Output the 'From' field. If there is only one
* field in the text, add the date as a second field
* on the line so that U*x mail utilities can properly
* identify starts of messages
*/
if (index(msgp->header.from,' '))
printf ("From %s\n", &msgp->header.from[0]);
else
printf ("From %s %s\n",
&msgp->header.from[0],
&msgp->header.date[0]);

printf ("From: %s\n", &msgp->header.from[0]);

/* There should always be a recipient specified */
printf ("To: %s\n", &msgp->header.to[0]);

/* There doesn't always have to be a CC list */
if (strlen(&msgp->header.cc[0]))
printf ("Cc: %s\n", &msgp->header.cc[0]);

/* There always has to be a date */
printf ("Date: %s\n", &msgp->header.date[0]);

/* There doesn't always have to be a subject */
if (strlen(&msgp->header.subject[0]))
printf ("Subject: %s\n", &msgp->header.subject[0]);

/* Let's assume the mail files were read at some point */
printf ("Status: RO\n");

/* This is dmextract-specific info for debugging */
printf ("DECmail-Info: %s\n", &msgp->header.mid[0]);
printf ("DMextract-Number: %d\n", msgp->number);
printf ("DMline-Count: %d\n", msgp->text.lines);

/* Separate header from text with a blank line */
printf ("\n");

/* Now we start outputting the message text */
fseek (fp, msgp->text.spos+4, SEEK_SET);
while (ftell(fp) < msgp->text.epos) {
reclen = getushort(fp);
if (reclen > 255) {
fseek (fp, ftell(fp)-1, SEEK_SET);
continue;
}
if (reclen == 0) continue;
if (reclen & 01)
fread (buffer, reclen, 1, fp);
else
fread (buffer, reclen-1, 1, fp);
buffer[reclen] = 0;
if (strncmp(&buffer[0],"From",4) == 0)
printf (">");
for (i = 0; i < reclen-1; i++) {
if (buffer[i] == 012) break;
if (buffer[i] == 015) {
(void) getc (fp);
continue;
}
printf ("%c", buffer[i]);
}
printf ("\n");
}
printf ("\n");
}
}