/* This code filters out almost all the "junk" in word documents and
* is useful for extracting the text from word documents. I think this
* program produces better signal to noise that catdoc. */
/* Junk filtering. Buffer size must be >=RESUME_CHARS */
int word_junk_filter::filter_junk(const char *b, int ns)
{
int i, d, n, top_run;
if ((n=__fill(b, ns, junk_end_usage, 1))==0)
return 0;
if (junk_end_usage>0)
memcpy(buf+ns+1, junk_end_buf, junk_end_usage);
if (init==INIT_SKIPPED)
{
init=INIT_DONE; // Finished init
ns=0; // Process all the data
text_size=0; // Text size is zero
}
/* Correctness of this loop depends on d<=i at the top of the
loop. The claims * by inductioon about the fake variable __adj__ suffice
to prove this. All the relevant commments are prefixed by CS */
top_run=0;
/*** CS: __adj__=1 ****/
for (i=ns+1, d=ns; i<=n; i++) // Ignore already processed data pushed back
{
/*** CS: i==d+junk_end_usage+__adj__ and __adj__>=0 here by induction ***/
/*** CS: if mode==SKIP_JUNK or SKIP_JUNK_WASPRN then __adj__>=1 ***/
#ifdef DEBUG_FILTER
printf("mode %d, char[%i] (t=%d) %c, asc=%d\n", mode, i, n,
buf[i], junk_end_usage);
#endif
switch(mode)
{
case NORMAL:
/* Normal mode. OK becasue change in d at most 1 and junk_end_usage=0 */
if (buf[i]==0)
{
junk_end_usage=0;
junk_size=0;
mode=SKIP_JUNK;
break;
}
if (buf[i] & 0x80)
{
/*** CS: by induction junk_end_usage==top_run ***/
junk_end_buf[junk_end_usage++]=buf[i]; // Save character
top_run++; // Increment top run counter
/*** CS: __adj__++ ***/
if (top_run==tune.max_top_run)
{
junk_size=top_run; // Top bit characters are junk
junk_end_usage=0; // Throw away characters
mode=SKIP_JUNK; // End junk skipping mode
/*** CS: __adj__ += top_run ***/
break;
}
break; // Exit switch
}
/* Top bit not set */
if (top_run)
{
memcpy(buf+d, junk_end_buf, junk_end_usage); // OK by ind hyp.
d+=junk_end_usage;
junk_end_usage=0; // Ind. hyp now true
top_run=0; // Reset top_run counter.
}
/* Add character */
buf[d++]=buf[i];
text_size++;
if (unicode!=__UNI_NO)
mode=UNICODE_Z;
break;
case UNICODE_Z:
/* unicode zero mode. OK because change in d is 0 and junk_end_usage=0 */
if (buf[i]==0)
{
mode=NORMAL; // 0 => Back to printable character mode
/*** CS: __adj__++ ***/
}
else
{
#ifdef UNICODE_MARKERS
printf("\n*** Mode %d: %d Got %d expected 0\n", mode,
UNICODE_Z, buf[i]);
#endif
junk_end_usage=0;
junk_size=0;
mode=SKIP_JUNK;
/*** CS: __adj__++ ***/
}
break;
case SKIP_JUNK_WASPRN:
/* Skip junk was printable, added for unicode documents */
/* OK because junk_end_usage chages by at most 1 and d stays fixed */
if ((++junk_size)==tune.max_junk && text_size>tune.min_text)
{
#ifdef DEBUG_SINK_START
cerr<<"Enter sink mode\n";
#endif /* DEBUG_SINK_START */
/*** CS: __adj__++ ***/
mode=SINK_JUNK; // Sink junk
break;
}
if (tune.options.unicode_aggresive)
{
/* Agresive unicode mode: insist on the MSBs appearing */
if (unicode!=__UNI_NO)
{
if (buf[i]!=0)
{
/*** CS: __adj__+=junk_end_usage ***/
junk_end_usage=0;
}
mode=SKIP_JUNK;
/*** CS: __adj__++ ***/
break;
}
}
else
{
/* Non-aggersive unicode mode: simply skip the MSB */
if (buf[i]==0 && unicode!=__UNI_NO)
{
/*** CS: __adj__++ ***/
break; // Accept zeros
}
}
/* Fall thru */
case SKIP_JUNK:
/* Junk skipping mode */
if ((++junk_size)==tune.max_junk && text_size>tune.min_text)
{
#ifdef DEBUG_SINK_START
cerr<<"Enter sink mode\n";
#endif /* DEBUG_SINK_START */
mode=SINK_JUNK; // Sink junk
/*** CS: __adj__++ ***/
break;
}
if (isprint(buf[i]) || buf[i]=='\r' || buf[i]=='\n'
|| buf[i]=='\t')
{
#ifdef DEBUG_RESTART
printf("---Accepted *%c*, usage=%d\n", buf[i], junk_end_usage);
#endif
junk_end_buf[junk_end_usage++]=buf[i]; // Save character
if (unicode!=__UNI_NO)
mode=SKIP_JUNK_WASPRN; // For unicode documents
}
else
{
/* Reset junk buffer usage counter */
/*** CS: __adj__+=junk_end_usage+1 ***/
#ifdef DEBUG_RESTART
if (junk_end_usage!=0)
printf("---Rejected %d, usage=%d\n", buf[i], junk_end_usage);
#endif
junk_end_usage=0;
break;
}
if (junk_end_usage<tune.resume_chars)
break;
/* Turn nomral mode back on */
#ifdef DEBUG_RESTART
printf("---back normal mode count=%d\n", junk_size);
#endif
/* The __adj__ analysis is *vital* here; if __Adj__=0 then d>i occurs */
/* however forutnately this is impossible by the claims that __adj__ */
/* and indcution proves. */
/**** CS: __adj__>=1 here by indcution. Thus i>=d+junk_end_usage+1 ****/
mode=(unicode==__UNI_NO) ? NORMAL : UNICODE_Z;
buf[d]=0x7f; // Mark suspect position
memcpy(buf+d+1, junk_end_buf, junk_end_usage); // Nothing clobbered by [1]
d+=junk_end_usage+1; // Advance d
junk_end_usage=0; // Clear junk end buffer
/*** CS: __adj__-- ***/
break;
int word_junk_filter::underflow(void)
{
int s,d,last;
int end;
do // While no characters
{
/* Fill the buffer */
if (init==INIT_NONE)
{
if (is!=NULL)
{
init=INIT_SKIPPED;
ns=this->skip_to_start();
end=this->filter_junk(buf, ns);
}
else
return EOF; // If no input stream return EOF
}
else
{
end=this->filter_junk(&save, ns);
ns=0;
}
/* Check for EOF */
if (end==0 && is->eof())
return EOF;
ns=0;
/* Collapse multi-character sequeneces */
for (s=0, d=0, last=-1; s<end; s++)
{
switch(buf[s])
{
case '\n':
if (last=='\r')
continue; // \n \r to \r
break;
case '\r':
if (last=='\n')
d--; // \r \n to \r
break;
case 0x07:
if (last=='\r')
d--; // \n <TAB SEP> to <TAB SEP>
break;
/* Skip to start */
int word_junk_filter::skip_to_start(void)
{
int zc,ffc,c,n,i,rej,limit=-1;
char back[2*ST_ASC_VCHARS_UNICODE+ST_REJ_LIMIT+4];
int bp;
long pos=0;
enum { FIND_BEGIN, SKIP_ONE, FIND_ASC, FIND_ASC_WASASC, DONE } mode;
zc=0; ffc=0; i=0; n=0; bp=0; rej=-1;
mode=FIND_BEGIN;
while (mode!=DONE)
{
/* If need a refill */
if (i==n)
{
/* Fill buffer */
n=this->__fill(back,bp,0,1);
if (n==0)
return 0;
pos+=n-bp;
#ifdef DEBUG_SKIP_FILL
printf("---skip--- refill bp=%x, pos=%x\n", bp, pos);
#endif
/* Ignore first saved character */
if (mode==FIND_ASC)
mode=SKIP_ONE;
i=0;
bp=0;
}
#ifdef DEBUG_SKIP
printf("char %c mode %d, c=%d, pos=%x, i=%d l=%d, rej=%d\n", buf[i+1],
mode, c, pos++, i, limit, rej);
#endif
switch(mode)
{
case FIND_BEGIN:
switch(buf[i+1])
{
case 0:
zc++;
ffc=0;
break;
case ((char) 0xff):
ffc++;
zc=0;
break;
default:
if (zc>=tune.st_min_zeros ||
(ffc>=tune.st_min_ff && tune.options.ff_intro))
{
unicode=__UNI_PROBE;
back[bp++]=buf[i+1];
c=0;
rej=0;
mode=FIND_ASC_WASASC; // Ignore this character deliberately
}
else
{
zc=0;
ffc=0;
}
break;
}
break;
case SKIP_ONE:
back[bp++]=buf[i+1];
c=0;
rej=0;
mode=FIND_ASC_WASASC;
break;
case FIND_ASC_WASASC: // unicode support
if (buf[i+1]==0 && unicode!=__UNI_NO)
{
if (unicode==__UNI_PROBE)
{
limit=tune.unicode_st; // Start characters needed
/* FIXME: Anyone got a better method of doing this? */
if ((pos+i) % 2==0)
{
unicode=__UNI_YES_BE; // 0 is first (even) bu#yte
}
else
{
unicode=__UNI_YES_LE; // 0 is second (odd) byte
}
back[bp++]=buf[i+1];
mode=FIND_ASC; // Inist next character not zero
#ifdef DEBUG_SKIP_UNI
cerr<<"Characters are unicode "
<<((unicode==__UNI_YES_LE) ? "litle" : "big")
<<" endian\n";
#endif
}
else
back[bp++]=buf[i+1];
c++;
break;
}
else if (unicode==__UNI_PROBE)
{
#ifdef DEBUG_SKIP_UNI
if (unicode==__UNI_PROBE)
cerr<<"Characters are not unicode\n";
#endif
limit=tune.non_unicode_st; // Start characters needed
unicode=__UNI_NO; // Non unicode
mode=FIND_ASC; // Finding printable mode
}
/* fall thru */
case FIND_ASC:
if (isprint(buf[i+1]) || buf[i+1]=='\r' || buf[i+1]=='\n'
|| buf[i+1]=='\t') // If is printable
{
back[bp++]=buf[i+1]; // Store character
c++; // Increase count
if (unicode!=__UNI_NO)
mode=FIND_ASC_WASASC; // Unicode hack...
}
else
{
back[bp++]=buf[i+1]; // Store caharcetr
if (buf[i+1]!=0)
rej++; // Adjust reject count
if (buf[i+1]==0 || rej>tune.st_rej_limit) // 0 or reject limit
{
bp=0; // Reset buffer pointer
unicode=__UNI_PROBE; // Set unicode status to unknonw
ffc=(buf[i+1]== (char) 0xff) ? 1 : 0; // reset ff count
zc=(buf[i+1]==0) ? 1 : 0; // reset zeros count
mode=FIND_BEGIN ; // Find leading characters mode
}
}
if (limit==-1) // Bug catcher
{
cerr<<"Bug: limit still -1 at point #1\n";
exit(2);
}
if (c==limit) // IF found enough
{
#ifdef DEBUG_SKIP
printf("***** Enter exit phase ****\n");
#endif
mode=DONE; // Finished skipping
}
break;
case DONE:
cerr<<"skip_to_start: loop called in done mode\n";
break;
/* Option handling stuff for tuning this module. Note that only long options
* are supported. If you lack long options these features are not avialable. */
/* set default tuning */
void word_junk_filter::set_dfl_tuning(void)
{
#ifdef AGGRESIVE_UNICODE
tune.options.unicode_aggresive=AGGRESIVE_UNICODE; // Aggresive unicode mode
#else
tune.options.unicode_aggresive=1; // Aggresive unicode mode
#endif /* AGGRESIVE_UNICODE */
tune.resume_chars=RESUME_CHARS; // Resume characters
tune.st_min_zeros=ST_MIN_ZEROS; // Minimum zeros
tune.st_rej_limit=ST_REJ_LIMIT; // Reject chars tolerance
tune.st_min_ff=ST_MIN_FF; // Minimum ff's (alternative to 0s)
tune.options.ff_intro=ALLOW_FF_LEADIN; // Enable leading ff's
tune.non_unicode_st=ST_ASC_VCHARS; // ok characters leadin
tune.unicode_st=ST_ASC_VCHARS_UNICODE; // ok unicode characters
tune.min_text=SINK_MIN_TEXT; // text required before sink mode
tune.max_junk=SINK_JUNK_CNT; // junk required for sink mode
tune.max_top_run=MAX_TOP_RUN; // Max run length of top bits set
}