#include <regexp.h>

#include <u.h>
#include <libc.h>
#include <bio.h>
#include <regexp.h>
#include "spam.h"

/*
 * Sizing constants for reading messages and scanning headers.
 */
enum {
Quanta = 8192,	/* bytes requested per Bread while readmsg looks for the end of the header */
Minbody = 6000,	/* readmsg tries to have at least this many body bytes in the buffer */
HdrMax = 15,	/* size of the scratch buffer endofhdr uses to collect a header keyword */
};

typedef struct keyword Keyword;
typedef struct word Word;

/*
 * A literal to look for at the start of some text; isword compares
 * the first n characters of the text against string.
 */
struct word{
char *string;	/* text to match; a nil string terminates a table */
int n;		/* number of characters of string to compare */
};

/*
 * Maps a name to an integer code; findkey returns value for the
 * entry whose string equals the name it was given.
 */
struct keyword{
char *string;	/* name; a nil string terminates a table */
int value;	/* code returned for this name */
};

Word htmlcmds[] =
{
"html", 4,
"!doctype html", 13,
0,

};

Word hrefs[] =
{
"a href=", 7,
"a title=", 8,
"a target=", 9,
"base href=", 10,
"img src=", 8,
"img border=", 11,
"form action=", 12,
"!--", 3,
0,

};

/*
* RFC822 header keywords to look for for fractured header.
* all lengths must be less than HdrMax defined above.
*/
Word hdrwords[] =
{
"cc:", 3,
"bcc:", 4,
"to:", 3,
0, 0,

};

Keyword keywords[] =
{
"header", HoldHeader,
"line", SaveLine,
"hold", Hold,
"dump", Dump,
"loff", Lineoff,
0, Nactions,
};

/*
 * Per-action pattern lists, indexed by action code.  Each entry starts
 * with a label string and two zero members; parsepats later hangs the
 * parsed patterns off the .strings and .regexps members of the entry
 * for the action named in the pattern file.
 * NOTE(review): the Patterns layout is declared elsewhere (presumably
 * spam.h) -- confirm which member each initializer lands in.
 */
Patterns patterns[] = {
[Dump] { "DUMP:", 0, 0 },
[HoldHeader] { "HEADER:", 0, 0 },
[Hold] { "HOLD:", 0, 0 },
[SaveLine] { "LINE:", 0, 0 },
[Lineoff] { "LINEOFF:", 0, 0 },
[Nactions] { 0, 0, 0 },
};

/*
 * Forward declarations of file-local helpers that are used before
 * their definitions appear.
 */
static char* endofhdr(char*, char*);
static int escape(char**);
static int extract(char*);
static int findkey(char*);
static int hash(int);
static int isword(Word*, char*, int);
static void parsealt(Biobuf*, char*, Spat**);

/*
 * The canonicalizer: convert input to canonical representation
 */

/*
 * Read the start of a message from bp into a freshly allocated,
 * NUL-terminated buffer and return it.
 *
 * Input is read Quanta bytes at a time until endofhdr finds the end
 * of the header, the input is exhausted, or more than Maxread bytes
 * have been read without finding it.  After the header is located an
 * attempt is made to have at least Minbody bytes of body in the buffer.
 *
 * hsize, if non-nil, receives the offset of the first byte after the
 * header; if hsize is nil the header is not searched for and only the
 * first chunk (topped up towards Minbody) is read.
 * bufsize, if non-nil, receives the number of bytes in the buffer.
 *
 * Returns 0, with nothing left allocated, on a read error or when the
 * input is empty.
 */
char*
readmsg(Biobuf *bp, int *hsize, int *bufsize)
{
	char *p, *buf;
	int n, offset, eoh, bsize, delta;

	buf = 0;
	offset = 0;
	if(bufsize)
		*bufsize = 0;
	if(hsize)
		*hsize = 0;
	for(;;) {
		buf = Realloc(buf, offset+Quanta+1);
		n = Bread(bp, buf+offset, Quanta);
		if(n < 0){
			free(buf);
			return 0;
		}
		p = buf+offset;			/* start of this chunk */
		offset += n;			/* end of this chunk */
		buf[offset] = 0;
		if(n == 0){
			if(offset == 0){
				/* empty input: release the buffer instead of leaking it */
				free(buf);
				return 0;
			}
			break;
		}

		if(hsize == 0)			/* don't process header */
			break;
		if(p != buf && p[-1] == '\n')	/* check for EOH across buffer split */
			p--;
		p = endofhdr(p, buf+offset);
		if(p)
			break;
		if(offset >= Maxread){		/* gargantuan header - just punt */
			if(hsize)
				*hsize = offset;
			if(bufsize)
				*bufsize = offset;
			return buf;
		}
	}
	eoh = p-buf;				/* End of header */
	bsize = offset - eoh;			/* amount of body already read */

	/* Read at least Minbody bytes of the body; a failed read here is ignored */
	if(bsize < Minbody){
		delta = Minbody-bsize;
		buf = Realloc(buf, offset+delta+1);
		n = Bread(bp, buf+offset, delta);
		if(n > 0) {
			offset += n;
			buf[offset] = 0;
		}
	}
	if(hsize)
		*hsize = eoh;
	if(bufsize)
		*bufsize = offset;
	return buf;
}

/*
 * Report whether text, which holds len characters, begins with any
 * entry of the nil-terminated table wp.  Entries longer than len
 * are never considered.  Returns 1 on a match, 0 otherwise.
 */
static int
isword(Word *wp, char *text, int len)
{
	Word *w;

	for(w = wp; w->string != 0; w++){
		if(w->n > len)
			continue;
		if(strncmp(text, w->string, w->n) == 0)
			return 1;
	}
	return 0;
}

/*
 * Find the end of the header in the text from raw up to end.
 *
 * The header ends at a pair of adjacent newlines, unless the text
 * after the pair starts with one of the keywords in hdrwords, in
 * which case the blank line is taken to be a break inside a
 * fractured header and the search goes on.
 *
 * Returns a pointer to the byte after the blank line, or 0 if no
 * end of header was found.  The byte at end is examined, so it must
 * be readable.
 */
static char*
endofhdr(char *raw, char *end)
{
	char word[HdrMax];
	char *s, *t;
	int len;

	/*
	 * can't use strchr to search for newlines because
	 * there may be embedded NULs.
	 */
	for(s = raw; s < end; s++){
		if(s[0] == '\n' && s[1] == '\n'){
			s++;		/* now on the second newline */

			/* collect the lower-cased word that follows, up to ':' or newline */
			len = 0;
			t = s+1;
			while(len < sizeof(word) && *t != 0){
				word[len++] = tolower(*t);
				if(*t == ':' || *t == '\n')
					break;
				t++;
			}
			if(!isword(hdrwords, word, len))
				return s+1;
		}
	}
	return 0;
}

/*
 * Canonicalize the tag text starting at text (just after a '<') and
 * report whether it begins with an entry of the table wp.
 *
 * The text is copied up to, but not including, the closing '>' (or
 * until end or the scratch buffer is reached): '=' escapes are decoded
 * by escape, NULs and carriage returns are dropped, runs of blanks,
 * tabs and newlines collapse to one blank, and letters are lowered.
 *
 * If n is non-nil it receives the number of input bytes consumed,
 * which includes the '>' when one was found.
 */
static int
htmlmatch(Word *wp, char *text, char *end, int *n)
{
	char tag[MaxHtml];
	char *s;
	int len, ch, prev;

	len = 0;
	prev = 0;
	s = text;
	while(s < end && len < sizeof(tag)-1){
		ch = *s++;
		if(ch == '=')
			ch = escape(&s);
		if(ch == '>')
			break;			/* end of the tag */
		if(ch == 0 || ch == '\r')
			continue;
		if(ch == '\n' || ch == ' ' || ch == '\t'){
			if(prev == ' ')
				continue;	/* squeeze repeated white space */
			ch = ' ';
		}else
			ch = tolower(ch);
		tag[len++] = ch;
		prev = ch;
	}
	tag[len] = 0;
	if(n)
		*n = s-text;
	return isword(wp, tag, len);
}

/*
 * Decode what follows an '=' in the text; *msg points just past the '='.
 *
 *	newline		the '=' was a soft line break: skip the newline
 *			and return the character after it
 *	2E 2F 20	return '.', '/' or ' ' (hex letters in either case)
 *	3D		return '='
 *	anything else	return '=' and consume nothing
 *
 * *msg is advanced past whatever was consumed.
 */
static int
escape(char **msg)
{
	char *s;
	int ch;

	s = *msg;
	ch = *s;
	switch(ch){
	case '\n':
		ch = s[1];
		s += 2;
		break;
	case '2':
		switch(tolower(s[1])){
		case 'e':
			ch = '.';
			s += 2;
			break;
		case 'f':
			ch = '/';
			s += 2;
			break;
		case '0':
			ch = ' ';
			s += 2;
			break;
		default:
			ch = '=';
			break;
		}
		break;
	case '3':
		if(tolower(s[1]) == 'd')
			s += 2;
		ch = '=';
		break;
	default:
		ch = '=';
		break;
	}
	*msg = s;
	return ch;
}

/*
 * Decide what to do with an HTML tag; *msg points just past its '<'.
 *
 * Until a tag from htmlcmds has been seen, tags are copied through
 * unless they start with '!' (comments), which are skipped.  The tag
 * that matches htmlcmds is itself skipped, and the fact that it was
 * seen is kept in a static flag that is never cleared here.  From then
 * on only tags that match hrefs are copied; the rest are skipped.
 *
 * Returns '<' with *msg untouched when the tag is to be copied.
 * Otherwise returns the character that follows the text htmlmatch
 * consumed, with *msg advanced past that character.
 */
static int
htmlchk(char **msg, char *end)
{
	static int seenhtml;
	char *start, *tail;
	int skip;

	start = *msg;
	if(seenhtml){
		if(htmlmatch(hrefs, start, end, &skip))	/* special HTML string */
			return '<';			/* copy it */
	}else{
		seenhtml = htmlmatch(htmlcmds, start, end, &skip);
		if(!seenhtml && *start != '!')		/* neither HTML keyword nor comment */
			return '<';			/* copy it */
	}

	/* an uninteresting HTML command; step over it */
	tail = start + skip;
	*msg = tail + 1;
	return *tail;
}

/*
 * Decode a base64-encoded body (msg up to end) into a temporary
 * buffer, then canonicalize the decoded text into buf as body text
 * via convert.  bufsize is passed on to convert unchanged.
 */
void
conv64(char *msg, char *end, char *buf, int bufsize)
{
	char *raw;
	int srclen, rawsize, rawlen;

	srclen = end - msg;
	rawsize = (srclen*3)/4 + 1;	/* room for max chars + null */
	raw = Malloc(rawsize);
	rawlen = dec64((uchar*)raw, rawsize, msg, srclen);
	convert(raw, raw+rawlen, buf, bufsize, 1);
	free(raw);
}

int
convert(char *msg, char *end, char *buf, int bufsize, int isbody)
{

char *p;
int c, lastc, base64;

lastc = 0;
base64 = 0;
while(msg < end && bufsize > 0){
c = *msg++;

/*
* In the body only, try to strip most HTML and
* replace certain MIME escape sequences with the character
*/
if(isbody) {
do{
p = msg;
if(c == '<')
c = htmlchk(&msg, end);
if(c == '=')
c = escape(&msg);
} while(p != msg && p < end);
}
switch(c){
case 0:
case '\r':
continue;
case '\t':
case ' ':
case '\n':
if(lastc == ' ')
continue;
c = ' ';
break;
case 'C': /* check for MIME base 64 encoding in header */
case 'c':
if(isbody == 0)
if(msg < end-32 && *msg == 'o' && msg[1] == 'n')
if(cistrncmp(msg+2, "tent-transfer-encoding: base64", 30) == 0)
base64 = 1;
c = 'c';
break;
default:
c = tolower(c);
break;
}
*buf++ = c;
lastc = c;
bufsize--;
}
*buf = 0;
return base64;
}

/*
 * The pattern parser: build data structures from the pattern file
 */

/*
 * Bucket number for a character: its low seven bits, so the result
 * is always in the range 0 through 127.
 */
static int
hash(int c)
{
	return c & 0x7f;
}

/*
 * Look up val in the keywords table and return the code stored with
 * it.  When val is not listed, the code of the terminating entry is
 * returned.
 */
static int
findkey(char *val)
{
	Keyword *k;

	k = keywords;
	while(k->string != 0 && strcmp(val, k->string) != 0)
		k++;
	return k->value;
}

/* true when c is a blank or a tab; c is evaluated more than once */
#define whitespace(c) ((c) == ' ' || (c) == '\t')

/*
 * Read the pattern file from bp and add every pattern to the
 * patterns[] entry of its action.
 *
 * Each line is handled as follows:
 *  - leading blanks and tabs are skipped; empty lines and lines
 *    starting with '#' are ignored;
 *  - a leading '*' marks a string pattern, otherwise the pattern
 *    is a regular expression;
 *  - the text up to the first ':' names the action (see keywords[]);
 *    lines without a ':' or with an unknown action are ignored;
 *  - the rest is canonicalized in place by extract; if nothing is
 *    left the line is ignored;
 *  - a "~~" ends the pattern; what follows is handed to parsealt,
 *    which collects the alternatives into the pattern's alt list.
 *
 * Regular expressions are compiled with regcomp and pushed on the
 * front of patterns[action].regexps; ones that fail to compile are
 * dropped.  String patterns are pushed on the hash chain selected by
 * their first character inside patterns[action].strings, which is
 * allocated on first use.
 */
void
parsepats(Biobuf *bp)
{
Pattern *p, *new;
char *cp, *qp;
int type, action, n, h;
Spat *spat;

for(;;){
cp = Brdline(bp, '\n');
if(cp == 0)
break;
/* replace the line's final character (the newline) with a NUL */
cp[Blinelen(bp)-1] = 0;
while(*cp == ' ' || *cp == '\t')
cp++;
if(*cp == '#' || *cp == 0)
continue;
type = regexp;
if(*cp == '*'){
type = string;
cp++;
}
/* split "action:pattern" at the first colon */
qp = strchr(cp, ':');
if(qp == 0)
continue;
*qp = 0;
if(debug)
fprint(2, "action = %s\n", cp);
action = findkey(cp);
if(action >= Nactions)
continue;
cp = qp+1;
n = extract(cp);
if(n <= 0 || *cp == 0)
continue;

/* cut the pattern off at "~~"; qp+2 is where the alternatives start */
qp = strstr(cp, "~~");
if(qp){
*qp = 0;
n = strlen(cp);
}
if(debug)
fprint(2, " Pattern: `%s'\n", cp);

/* Hook regexps into a chain */
if(type == regexp) {
new = Malloc(sizeof(Pattern));
new->action = action;
new->pat = regcomp(cp);
if(new->pat == 0){
free(new);
continue;
}
new->type = regexp;
new->alt = 0;
new->next = 0;

if(qp)
parsealt(bp, qp+2, &new->alt);

new->next = patterns[action].regexps;
patterns[action].regexps = new;
continue;

}
/* not a Regexp - hook strings into Pattern hash chain */
spat = Malloc(sizeof(*spat));
spat->next = 0;
spat->alt = 0;
spat->len = n;
spat->string = Malloc(n+1);
/* second character, kept so matchpat can reject quickly */
spat->c1 = cp[1];
strcpy(spat->string, cp);

if(qp)
parsealt(bp, qp+2, &spat->alt);

/* the per-action string table is created the first time it is needed */
p = patterns[action].strings;
if(p == 0) {
p = Malloc(sizeof(Pattern));
memset(p, 0, sizeof(*p));
p->action = action;
p->type = string;
patterns[action].strings = p;
}
h = hash(*spat->string);
spat->next = p->spat[h];
p->spat[h] = spat;
}
}

/*
 * Collect the alternatives of a pattern.  cp points just past the
 * "~~" that ended the pattern; further alternatives are separated by
 * "~~" as well.  Each non-empty alternative is copied with strdup
 * and pushed on the front of the list at *head; only the string and
 * next members of the new entry are set.
 *
 * When the text runs out right after a "~~", the list continues on
 * the next line of bp that extract does not reduce to nothing.
 * Reaching the end of bp ends the list.
 */
static void
parsealt(Biobuf *bp, char *cp, Spat** head)
{
	char *item, *rest;
	Spat *node;

	for(; cp != 0; cp = rest){
		if(*cp == 0){		/* escaped newline */
			for(;;){
				cp = Brdline(bp, '\n');
				if(cp == 0)
					return;
				cp[Blinelen(bp)-1] = 0;
				if(extract(cp) > 0 && *cp != 0)
					break;
			}
		}

		/* split off the next alternative */
		item = cp;
		rest = strstr(item, "~~");
		if(rest != 0){
			*rest = 0;
			rest += 2;
		}
		if(*item != 0){
			node = Malloc(sizeof(*node));
			node->string = strdup(item);
			node->next = *head;
			*head = node;
		}
	}
}

/*
 * Canonicalize pattern text in place and return its new length.
 *
 * Leading blanks and tabs are dropped, upper-case ASCII letters are
 * lowered, and an unquoted '#' ends the text.  Inside double quotes
 * everything is kept (including '#'), and \" stands for a quote;
 * the quotes themselves are removed.  Trailing blanks and tabs are
 * trimmed, but never back into a quoted string.
 */
static int
extract(char *cp)
{
	char *src, *dst, *floor;
	int ch;

	src = dst = floor = cp;
	while(whitespace(*src))
		src++;
	for(;;){
		ch = *src++;
		if(ch == 0 || ch == '#')
			break;
		if(ch != '"'){
			if(ch >= 'A' && ch <= 'Z')
				ch += 'a'-'A';
			*dst++ = ch;
			continue;
		}

		/* quoted string: copy up to the closing quote */
		while(*src != 0 && *src != '"'){
			if(src[0] == '\\' && src[1] == '"')
				src++;
			ch = *src++;
			if(ch >= 'A' && ch <= 'Z')
				ch += 'a'-'A';
			*dst++ = ch;
		}
		if(*src != 0)
			src++;
		floor = dst;	/* never back up over a quoted string */
	}
	while(dst > floor && whitespace(dst[-1]))
		dst--;
	*dst = 0;
	return dst-cp;
}

/*
 * The matching engine: compare canonical input to pattern structures
 */

/*
 * Return the first entry of the list alt whose string occurs in
 * cmd (when cmd is non-empty), in the text at header+1, or in
 * message; return 0 if no entry does.  cmd and header+1 are not
 * searched separately when message is that very buffer.
 */
static Spat*
isalt(char *message, Spat *alt)
{
	Spat *a;

	for(a = alt; a != 0; a = a->next){
		if(*cmd && message != cmd && strstr(cmd, a->string))
			return a;
		if(message != header+1 && strstr(header+1, a->string))
			return a;
		if(strstr(message, a->string))
			return a;
	}
	return 0;
}

/*
 * Match the NUL-terminated canonical text message against pattern p.
 *
 * For a string pattern table, every position of message is tried
 * against the hash chain for the character at that position; a
 * candidate must agree on the second character (c1), match in full,
 * and have none of its alternatives present (see isalt).  On success
 * m->sp and m->ep delimit the matched text.
 *
 * For a regular expression, regexec fills in m; the match is rejected
 * if one of the pattern's alternatives is present.
 *
 * Returns 1 on a match, 0 otherwise.
 */
int
matchpat(Pattern *p, char *message, Resub *m)
{
	Spat *spat;
	char *s;
	int c, c1;

	if(p->type == string){
		c1 = *message;
		for(s=message; c=c1; s++){
			c1 = s[1];
			for(spat=p->spat[hash(c)]; spat; spat=spat->next){
				if(c1 != spat->c1)
					continue;
				/*
				 * strncmp rather than memcmp: fewer than
				 * spat->len bytes may remain in message, and
				 * the comparison must not run past its NUL.
				 */
				if(strncmp(s, spat->string, spat->len) != 0)
					continue;
				if(isalt(message, spat->alt))
					continue;
				m->sp = s;
				m->ep = s + spat->len;
				return 1;
			}
		}
		return 0;
	}
	m->sp = m->ep = 0;
	if(regexec(p->pat, message, m, 1) == 0)
		return 0;
	if(isalt(message, p->alt))
		return 0;
	return 1;
}

/*
 * Print the match m on fd with some context, in the form
 *	type before~match~after
 * where before and after extend roughly 30 characters from the match
 * out to the nearest blank.  Nothing is printed if m has no match.
 *
 * The backward scan stops only at a NUL or a blank, so the text must
 * be preceded by a NUL byte.
 */
void
xprint(int fd, char *type, Resub *m)
{
	char *lo, *hi;
	int k;

	if(m->sp == 0 || m->ep == 0)
		return;

	/* back up approx 30 characters to whitespace */
	lo = m->sp;
	k = 0;
	while(*lo && k < 30){
		k++;
		lo--;
	}
	while(*lo && *lo != ' ')
		lo--;
	lo++;

	/* grab about 30 more chars beyond the end of the match */
	hi = m->ep;
	k = 0;
	while(*hi && k < 30){
		k++;
		hi++;
	}
	while(*hi && *hi != ' ')
		hi++;

	fprint(fd, "%s %.*s~%.*s~%.*s\n", type,
		utfnlen(lo, m->sp-lo), lo,
		utfnlen(m->sp, m->ep-m->sp), m->sp,
		utfnlen(m->ep, hi-m->ep), m->ep);
}

/* marker stored in t64d for bytes that have no base64 value */
enum {
INVAL= 255
};

/*
 * Base64 decoding table, indexed by byte value: 'A'-'Z' give 0-25,
 * 'a'-'z' give 26-51, '0'-'9' give 52-61, '+' gives 62 and '/' gives
 * 63; every other byte gives INVAL.  The comment at the start of each
 * pair of rows is the hex index of its first element.
 * NOTE(review): nothing in the visible part of this file refers to
 * this table -- confirm whether it is still needed.
 */
static uchar t64d[256] = {
/*00 */ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*10*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*20*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
INVAL, INVAL, INVAL, 62, INVAL, INVAL, INVAL, 63,
/*30*/ 52, 53, 54, 55, 56, 57, 58, 59,
60, 61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*40*/ INVAL, 0, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14,
/*50*/ 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, INVAL, INVAL, INVAL, INVAL, INVAL,
/*60*/ INVAL, 26, 27, 28, 29, 30, 31, 32,
33, 34, 35, 36, 37, 38, 39, 40,
/*70*/ 41, 42, 43, 44, 45, 46, 47, 48,
49, 50, 51, INVAL, INVAL, INVAL, INVAL, INVAL,
/*80*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*90*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*A0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*B0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*C0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*D0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*E0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*F0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
};