/*
 * A table entry: a string together with an integer count.
 * In the tables that follow, n is the length of string and an
 * entry whose string is 0 ends the table.
 */
typedef struct word Word;
struct word {
	char	*string;	/* text of the entry; 0 in the terminating entry */
	int	n;		/* number of characters in string */
};

/*
 * A string paired with an integer value.
 */
typedef struct keyword Keyword;
struct keyword {
	char	*string;	/* keyword text */
	int	value;		/* value associated with the keyword */
};
Word htmlcmds[] =
{
"html", 4,
"!doctype html", 13,
0,
};
Word hrefs[] =
{
"a href=", 7,
"a title=", 8,
"a target=", 9,
"base href=", 10,
"img src=", 8,
"img border=", 11,
"form action=", 12,
"!--", 3,
0,
};
/*
* RFC822 header keywords to look for for fractured header.
* all lengths must be less than HdrMax defined above.
*/
Word hdrwords[] =
{
"cc:", 3,
"bcc:", 4,
"to:", 3,
0, 0,
/* forward declarations of file-local (static) helpers */
static char	*endofhdr(char *, char *);
static int	escape(char **);
static int	extract(char *);
static int	findkey(char *);
static int	hash(int);
static int	isword(Word *, char *, int);
static void	parsealt(Biobuf *, char *, Spat **);
/*
* The canonicalizer: convert input to canonical representation
*/
/*
 * Read the start of a message from bp into a freshly allocated,
 * NUL-terminated buffer and return it.
 *
 * Input is read in Quanta-sized chunks.  When hsize is non-nil,
 * reading continues until endofhdr() finds the end of the header,
 * the input runs out, or at least Maxread bytes have been read without
 * finding it (in which case the whole buffer is reported as header).
 * When hsize is nil only the first chunk is read and no header
 * search is made.  Afterwards one more read is attempted so that,
 * if the input has them, at least Minbody bytes follow the header.
 *
 * On success *hsize (if non-nil) is set to the offset of the end of
 * the header and *bufsize (if non-nil) to the number of bytes read.
 * Returns 0, with both outputs left at 0, on a read error or when
 * the input is empty.  The buffer comes from Realloc; presumably the
 * caller is expected to free it.
 */
char*
readmsg(Biobuf *bp, int *hsize, int *bufsize)
{
	char *p, *buf;
	int n, offset, eoh, bsize, delta;

	buf = 0;
	offset = 0;
	if(bufsize)
		*bufsize = 0;
	if(hsize)
		*hsize = 0;
	for(;;) {
		/* always keep one spare byte for the terminating NUL */
		buf = Realloc(buf, offset+Quanta+1);
		n = Bread(bp, buf+offset, Quanta);
		if(n < 0){
			free(buf);
			return 0;
		}
		p = buf+offset;			/* start of this chunk */
		offset += n;			/* end of this chunk */
		buf[offset] = 0;
		if(n == 0){
			if(offset == 0){
				/* empty input: release the buffer allocated above */
				free(buf);
				return 0;
			}
			break;			/* EOF: p is the end of the data */
		}
		if(hsize == 0)			/* don't process header */
			break;
		if(p != buf && p[-1] == '\n')	/* check for EOH across buffer split */
			p--;
		p = endofhdr(p, buf+offset);
		if(p)
			break;
		if(offset >= Maxread){		/* gargantuan header - just punt */
			*hsize = offset;	/* hsize is known to be non-nil here */
			if(bufsize)
				*bufsize = offset;
			return buf;
		}
	}
	eoh = p-buf;				/* End of header */
	bsize = offset - eoh;			/* amount of body already read */

	/* Read at least Minbody bytes of the body */
	if(bsize < Minbody){
		delta = Minbody-bsize;
		buf = Realloc(buf, offset+delta+1);
		n = Bread(bp, buf+offset, delta);
		if(n > 0) {			/* a short or failed read leaves what we have */
			offset += n;
			buf[offset] = 0;
		}
	}
	if(hsize)
		*hsize = eoh;
	if(bufsize)
		*bufsize = offset;
	return buf;
}
/* If not an HTML keyword, check if it's
* an HTML comment (<!comment>). if so,
* skip over it; otherwise copy it in.
*/
if(ishtml == 0 && *p != '!') /* not comment */
return '<'; /* copy it */
} else if(htmlmatch(hrefs, p, end, &n)) /* if special HTML string */
return '<'; /* copy it */
/*
* this is an uninteresting HTML command; skip over it.
*/
p += n;
*msg = p+1;
return *p;
}
/*
 *	Decode a base64-encoded body.
 *
 *	The text in [msg, end) is decoded with dec64() into a temporary
 *	buffer, and the decoded bytes are then passed through convert()
 *	as body text, which stores the canonical form in buf (see
 *	convert() for how bufsize is interpreted).
 */
void
conv64(char *msg, char *end, char *buf, int bufsize)
{
	int len, i;
	char *cp;

	len = end - msg;
	i = (len*3)/4+1;	/* room for max chars + null */
	cp = Malloc(i);
	len = dec64((uchar*)cp, i, msg, len);
	/*
	 * If dec64 reports failure with a negative count, treat the
	 * result as empty rather than forming a pointer before cp;
	 * convert() then just stores an empty string in buf.
	 */
	if(len < 0)
		len = 0;
	convert(cp, cp+len, buf, bufsize, 1);
	free(cp);
}
/*
 * Canonicalize the text in [msg, end) into buf.
 *
 * NULs and carriage returns are dropped, every run of blanks, tabs
 * and newlines becomes a single space, and all other characters are
 * converted to lower case.  When isbody is non-zero, a '<' is first
 * handed to htmlchk() and a '=' to escape(); either may consume
 * input and substitute the character to be processed.  When isbody
 * is zero, the text is also searched for a
 * "Content-Transfer-Encoding: base64" header line.
 *
 * At most bufsize characters are stored, followed by a terminating
 * NUL, so buf must have room for bufsize+1 bytes.
 *
 * Returns 1 if the base64 header was seen, otherwise 0.
 */
int
convert(char *msg, char *end, char *buf, int bufsize, int isbody)
{
	char *p;
	int c, lastc, base64;

	lastc = 0;
	base64 = 0;
	while(msg < end && bufsize > 0){
		c = *msg++;

		/*
		 * In the body only, try to strip most HTML and
		 * replace certain MIME escape sequences with the character.
		 * Repeat for as long as the helpers keep consuming input.
		 */
		if(isbody) {
			do{
				p = msg;
				if(c == '<')
					c = htmlchk(&msg, end);
				if(c == '=')
					c = escape(&msg);
			} while(p != msg && p < end);
		}
		switch(c){
		case 0:
		case '\r':
			continue;
		case '\t':
		case ' ':
		case '\n':
			if(lastc == ' ')	/* collapse runs of white space */
				continue;
			c = ' ';
			break;
		case 'C':	/* check for MIME base 64 encoding in header */
		case 'c':
			/*
			 * Header field names are case-insensitive, so the
			 * whole remainder of the name is compared without
			 * regard to case.  The length test is written as a
			 * difference so that no pointer before the start of
			 * a short buffer is ever formed.
			 */
			if(isbody == 0 && end-msg > 32)
			if(cistrncmp(msg, "ontent-transfer-encoding: base64", 32) == 0)
				base64 = 1;
			c = 'c';
			break;
		default:
			/* c may be negative where char is signed; tolower needs an unsigned char value */
			c = tolower((unsigned char)c);
			break;
		}
		*buf++ = c;
		lastc = c;
		bufsize--;
	}
	*buf = 0;
	return base64;
}
/*
* The pattern parser: build data structures from the pattern file
*/