/* index sequential files, producing .idx and .sel files */
/* Bruce Tanner - Cerritos College */
/*
Version History:
1.0 05/10/93 Original attempt
2.0 06/20/93 Create indexed files directly, add keyword count field
2.1 07/08/93 Change the file name for NOISE_WORDS
2.2 07/08/93 Move the range end (end_pos) to before the terminator
2.2jlw 07/14/93 - JLW added length spec to dash, added additional topic
divider keywords
2.3 07/19/93 Set multi-buffer, multi-block counts, read-ahead, write-behind
and deferred write; noticeably increased performance
2.4 07/26/93 Removed index name, added CLI$ interface, added /TOPIC
2.4jlw 07/27/93 fixed version retention, which was broken
2.5 07/27/93 Selector strings forced to lowercase; use a good copy
2.6 07/29/93 revamp /TOPIC syntax to include text, size, exclude
2.7 07/30/93 make SIZE=n pad as well as truncate field width
2.8 08/03/93 take wildcard input file names, add /OUTPUT, /VERSION
2.9 08/05/93 JLW changed filename sizes from 80 to 256 characters
2.10 08/05/93 add check for max number of topics, reformat code
2.11 08/24/93 JLW added specific statuses for exit errors
2.12 10/01/93 add /NODEFAULT_TOPIC to omit topics that have no topic keyword
2.13 11/03/93 add /LINK to generate .link file instead of .idx/.sel
2.14 11/15/93 add /NOISE=file to specify the noise words file
2.15 11/17/93 add /TOPIC=(position), /FIELD=(position, size), /PUNCTUATION
2.16 11/18/93 fix illegal strcpy for AXP, add /MAX_TOPICS
2.17 11/21/93 make load_noise friendlier, add /NOPUNCTUATION support
2.18 11/27/93 add /MINIMUM_WORD, /COUNT_WORDS
2.19 11/30/93 fix broken /TOPIC
2.20 03/20/94 sort words, add /LINK=SORT, /SEQUENTIAL, remove /COUNT_WORDS
2.21 04/29/94 add /NONUMBERS
2.22 06/23/94 add /TOPIC=(offset) /TOPIC=(position=0)
2.23 06/24/94 add /TOPIC=(end)
2.24 06/27/94 add /CANDIDATE, /KEYWORD=(text,end,exclude)
2.24a 06/29/94 replaced VAXC-specific "#include foo" declarations with
more portable "#include <foo.h>" (so DECC won't balk).
2.25 08/04/94 fix /TOPIC=END not matching
2.26 09/15/94 /KEYWORD=END=foo stopped at end of line if 'foo' wasn't found
2.27 09/27/94 change get_text() to return updated pointer to fix mangled text
3.0 09/29/94 redo parsing routines, add /SPECIFICATION, /TOPIC=BREAK
3.1 10/10/94 add /SELECTOR, don't index selector line
3.2 10/17/94 change /SELECTOR to /SELECTOR=(TEXT,END,BOTH)
3.3 11/04/94 add /KEYWORD=(offset), extend selector to include host/port
3.4 11/07/94 add /HELPFILE /SELECTOR=IGNORE
3.5 12/16/94 move close of link file for wildcards
3.6 01/02/95 program around selector.end default problem
*/
#define CHUNK 100 /* increment to expand table of words */
#define DESC_SIZE 500 /* maximum size of a topic description */
#define SELECTOR_SIZE 200 /* maximum size of a selector (minus description) */
#define TOPIC_SIZE 20 /* maximum number of topics to list */
/* default set of characters that separate words (see /PUNCTUATION) */
#define PUNCT_CHARS "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
#define DEFAULT_POS 0 /* column to look for topic by default; 0 = anywhere */
/* size of the buffers that hold one line of input text */
#define MAX_INPUT_LINE 500
/* One /TOPIC or /KEYWORD rule: the text to look for in a source line and
   how the text that follows it is captured.  Tables of these are scanned
   until the first entry whose 'used' flag is zero. */
typedef struct {
char *text; /* text to search for in the line; NULL when no text was given */
char *found; /* buffer (from my_realloc) holding the text captured for the current section */
char *end; /* text that terminates the captured / indexed range */
int pos; /* column (origin 1) where text must start; 0 = anywhere in the line */
int size; /* when > 0, captured text is truncated/padded to this width; 0 = up to 'end' */
int offset; /* added to the match position before text is captured */
int exclude; /* non-zero: the matched text itself is skipped */
int force_break; /* non-zero: a match also ends the current section */
int used; /* non-zero marks an entry in use; the first unused entry ends the table */
} topic_str;
status = lib$get_foreign(&input_dsc, 0, &leng, 0);
for (ind = leng; ind >= 0; ind--)
cli_input[ind+6] = cli_input[ind];
strncpy(cli_input, "index ", 6);
input_dsc.dsc$w_length = leng+6;
status = cli$dcl_parse(&input_dsc, index_commands);
if (status != CLI$_NORMAL) /* error in parse, exit */
exit(7);
if ((cli$present(descr("file")) & 1) == 0) {
printf("Usage: index document\n");
printf(" /CANDIDATES=file specify a file of words for index candidates\n");
printf(" /CHARACTER=n text separated by control character 'n'\n");
printf(" /DASH=n text separated n dashes (default 3)\n");
printf(" /[NO]DEFAULT_TOPIC keep [discard] topics [not] matched by /TOPIC\n");
printf(" /EQUAL=n text separated n equals (default 80)\n");
printf(" /FF text separated by form feeds\n");
printf(" /FIELD=(position,size) specify topic break on field\n");
printf(" /HELPFILE=(selector,title) file to match query \"?\"\n");
printf(" /KEYWORD=(text,end,offset,exclude) specify indexing range\n");
printf(" /LINE each line is separate text entry\n");
printf(" /LINK[=SORT] generate .link file instead of .idx,.sel files\n");
printf(" /MAX_TOPICS=n maximum size of topic ID field (default 4)\n");
printf(" /MINIMUM_WORD=n define minimum word to index (default 3)\n");
printf(" /NOISE=file specify a file of words to omit in the index\n");
printf(" /NONUMBERS omit all numbers from the index\n");
printf(" /OUTPUT=file override name of index/selection files\n");
printf(" /PARAGRAPH text separated by blank lines\n");
printf(" /PUNCTUATION=\"...\" specify the characters that separate words\n");
printf(" /SELECTOR=(text,end,both,ignore) specify selectors to generate\n");
printf(" /SEQUENTIAL create sequential files (.seqidx, .seqsel)\n");
printf(" /SPECIFICATION=file specify a file of switches\n");
printf(" /TOPIC=(text,end,position,size,offset,exclude,break) specify topic names\n");
printf(" /[NO]VERSION keep [discard] document version in selection\n");
printf(" /WHOLE whole file is one text entry\n");
printf(" /WORD_LENGTH=n maximum size of index key (default 20)\n");
exit(1);
}
if (cli$present(descr("specification")) & 1) {
status = cli$get_value(descr("specification"), &spec_dsc, &leng);
spec_name[leng] = '\0';
if ((spc = fopen(spec_name, "r")) == NULL) {
printf("Can't read spec file %s\n", spec_name);
exit(13);
}
/* parse every line of the spec file */
while (fgets(spec_line, sizeof(spec_line), spc)) {
if ((spec_line[0] == '\n') || (spec_line[0] == '#') ||
(spec_line[0] == '!'))
continue; /* skip blank and comment lines */
ptr = strchr(spec_line, '\n');
if (ptr) *ptr = '\0';
leng = strlen(spec_line);
for (ind = leng; ind >= 0; ind--)
spec_line[ind+6] = spec_line[ind];
strncpy(spec_line, "index ", 6);
spec_line_dsc.dsc$w_length = leng+6;
parse_commands(&spec_line_dsc, switches);
}
}
parse_commands(&input_dsc, switches);
if (switch_present("paragraph"))
type = para;
if (switch_present("ff")) {
type = hex; /* /FF same as /character=12 */
hex_value = '\f';
}
if (switch_present("character")) {
hex_value = atoi(switch_value("character"));
type = hex;
}
if (switch_present("whole"))
type = whole;
if (switch_present("line"))
type = line;
if (switch_present("dash")) {
dash_len = atoi(switch_value("dash"));
type = dash;
}
if (switch_present("equal")) {
dash_len = atoi(switch_value("equal"));
type = equal;
}
if (switch_present("word_length")) {
max_word = atoi(switch_value("word_length"));
}
if (switch_present("field")) {
type = field;
field_pos = atoi(switch_value("field.position"));
field_size = atoi(switch_value("field.size"));
}
strcpy(punctuation, PUNCT_CHARS); /* default for /punctuation */
if (switch_present("punctuation")) {
strcpy(temp_punct, switch_value("punctuation"));
if (temp_punct[0] == '"') { /* if quoted string */
strncpy(punctuation, temp_punct+1, leng-2);
punctuation[leng-2] = '\0';
}
else if (strlen(temp_punct) > 0)
strcpy(punctuation, temp_punct);
}
else { /* /nopunctuation="$" means exclude $ from punct chars */
if (cp = switch_value("punctuation"))
strcpy(temp_punct, cp);
else
strcpy(temp_punct, "");
if (temp_punct[0] == '"') { /* if quoted string */
strcpy(temp_punct, temp_punct+1);
temp_punct[leng-2] = '\0';
}
for (cp = temp_punct; *cp; cp++) {
cp2 = strchr(punctuation, *cp);
if (cp2)
strcpy(cp2, cp2+1); /* remove character from punctuation */
}
}
if (switch_present("max_topics")) {
max_topic = max_count = atoi(switch_value("max_topics")); /* query assumes topic = count */
if (max_topic > 9) {
printf("/MAX_TOPICS specifies the number of digits in the topic number field.\n");
printf("A 32 bit system cannot handle integers greater than 9 digits.\n");
exit(9);
}
}
if (switch_present("minimum_word"))
minimum_word = atoi(switch_value("minimum_word"));
sequential = (switch_present("sequential"));
strcpy(file_arg, switch_value("file")); /* get source */
file_dsc.dsc$w_length = (short) strlen(file_arg); /* set the descriptor length */
strncpy(file_spec, "", sizeof(file_spec)); /* clear out file_spec */
status = lib$find_file(&file_dsc, &file_spec_dsc, &context, 0, 0, 0, 0);
if ((status & 1) == 0) {
perror("lib$find_file failed");
exit(11);
}
ptr = strchr(file_spec, ' ');
if (ptr)
*ptr = '\0'; /* chop off trailing spaces */
strcpy(out_name, file_spec); /* make copy for output spec */
if (switch_present("output")) /* if /output, overwrite out_name */
strcpy(out_name, switch_value("output"));
words_size = words_index = 0; /* no words yet */
words = NULL;
noise_size = noise_index = 0; /* no noise yet */
noise = NULL;
candidate_size = candidate_index = 0; /* no candidate yet */
candidate = NULL;
if (!switch_present("link")) {
load_words("noise", punctuation, &noise, &noise_size, &noise_index);
load_words("candidates", punctuation, &candidate, &candidate_size, &candidate_index);
}
ptr = strrchr(out_name, '.'); /* just get file name */
if (ptr) *ptr = '\0';
if (sequential)
strcat(out_name, ".seqidx");
else
strcat(out_name, ".idx");
if (!switch_present("link")) {
if (((status = sys$create(&idxfab)) & 1) != SS$_NORMAL)
lib$stop(status);
if (((status = sys$connect(&idxrab)) & 1) != SS$_NORMAL)
lib$stop(status);
}
ptr = strrchr(out_name, '.'); /* just get file name */
if (ptr) *ptr = '\0';
if (sequential)
strcat(out_name, ".seqsel");
else
strcat(out_name, ".sel");
while (fgets(src_line, sizeof(src_line), src)) {
/* if the first character of the line is the hex value, end topic */
if ((src_line[0] == hex_value) && (type == hex)) {
write_words(src, lnk, &selrab, &idxrab, &start_pos,
&end_pos, desc, topics);
continue;
}
ptr = strchr(src_line, '\n');
if (ptr) *ptr = '\0'; /* remove newline */
for (ptr = src_line; *ptr; ptr++)
if (iscntrl(*ptr)) *ptr = ' '; /* convert tabs to spaces */
while ((strlen(src_line) > 0) &&
(src_line[strlen(src_line)-1] == ' '))
src_line[strlen(src_line)-1] = '\0';/* remove trailing blanks */
strcpy(orig_line, src_line); /* copy before forcing lower case */
for (ptr = src_line; *ptr; ptr++)
*ptr = _tolower(*ptr); /* force lowercase */
strcpy(lc_line, src_line); /* copy with leading blanks */
for (ptr = src_line; *ptr; ptr++)
if (*ptr > ' ') break; /* find first non-blank char */
strcpy(src_line, ptr); /* remove leading blanks */
/* break on dashes */
if (((type == equal) || (type == dash)) &&
(strncmp(orig_line, dashes, dash_len) == 0)) {
write_words(src, lnk, &selrab, &idxrab, &start_pos,
&end_pos, desc, topics);
continue;
}
/* break on paragraph */
if ((type == para) && (strlen(src_line) == 0)) {
write_words(src, lnk, &selrab, &idxrab, &start_pos,
&end_pos, desc, topics);
continue;
}
/* break on non-empty field */
if ((type == field) && !is_spaces(orig_line, field_pos, field_size)) {
write_words(src, lnk, &selrab, &idxrab, &start_pos,
&end_pos, desc, topics);
start_pos = end_pos; /* don't skip over line with field break */
}
/* save the first line by default */
if ((switch_present("default_topic")) && (strlen(desc) == 0))
strncpy(desc, orig_line, DESC_SIZE);
if (switch_present("selector.text") &&
(where = find_str(lc_line, switch_value("selector.text")))) {
selector_spec = (char *) my_realloc((char *) selector_spec,
strlen(orig_line) + 1);
strcpy(selector_spec, orig_line + where - 1 +
strlen(switch_value("selector.text")));
if (switch_present("selector.end") && /* if selector=end given */
(where = find_str(selector_spec, switch_value("selector.end"))))
selector_spec[where - 1] = '\0'; /* mark selector end */
while (*selector_spec == ' ') /* remove leading spaces */
strcpy(selector_spec, selector_spec + 1);
continue; /* do not index this line */
}
else if (selector_spec == NULL)
selector_spec = (char *) calloc(1, sizeof(char)); /* make empty spec */
for (index = 0; topics[index].used; index++) { /* apply topic rules */
where = topics[index].pos; /* where text is found */
/* if text matches the source text and position */
if (
((topics[index].pos > 0) && topics[index].text
&& strncmp(lc_line + topics[index].pos - 1,
topics[index].text, strlen(topics[index].text)) == 0)
|| /* or position = 0 and text is found _somewhere_ */
((topics[index].pos == 0)
&& (where = find_str(lc_line, topics[index].text)))
|| /* or no text given but position and size field is non-blank */
(!topics[index].text &&
!is_spaces(orig_line, topics[index].pos, topics[index].size))
) {
/* if topic matches and requested a break, do it */
if (topics[index].force_break) {
write_words(src, lnk, &selrab, &idxrab, &start_pos,
&end_pos, desc, topics);
start_pos = end_pos; /* don't skip over topic line */
}
if (topics[index].exclude)
strcpy(orig_line + where - 1,
orig_line + where - 1 + strlen(topics[index].text));
topics[index].found = (char *) my_realloc((char *) topics[index].found,
(topics[index].size ? topics[index].size : strlen(orig_line))
+ 1);
if (topics[index].size > 0) {
strncpy(topics[index].found, orig_line
+ where - 1 + topics[index].offset,
topics[index].size);
topics[index].found[topics[index].size] = '\0';
strncat(topics[index].found, spaces,
topics[index].size - strlen(topics[index].found));
}
else {
strcpy(topics[index].found, orig_line
+ where - 1 + topics[index].offset);
if (where = find_str(topics[index].found, topics[index].end))
topics[index].found[where - 1] = '\0'; /* terminate the found string */
}
break; /* a line satisfies only one topic rule */
}
}
if (!switch_present("link"))
test_words(src_line, punctuation, minimum_word, keywords);
end_pos = ftell(src); /* end_pos points before any terminator */
if (type == line)
write_words(src, lnk, &selrab, &idxrab, &start_pos,
&end_pos, desc, topics);
}
/* in case file doesn't end with a terminator */
write_words(src, lnk, &selrab, &idxrab, &start_pos, &end_pos,
desc, topics);
fclose(src);
status = lib$find_file(&file_dsc, &file_spec_dsc, &context, 0, 0, 0, 0);
if ((status & 1) == 0) {
lib$find_file_end(&context);
break;
}
ptr = strchr(file_spec, ' ');
if (ptr) *ptr = '\0'; /* chop off trailing spaces */
}
if (switch_present("link"))
fclose(lnk);
else {
sys$close(&selfab);
sys$close(&idxfab);
}
}
/* Remove the first 'count' characters of 'text' in place.  The count is
   clamped to the length of the string, and memmove() is used because the
   source and destination overlap (strcpy() is undefined for that). */
static void drop_leading(char *text, long count)
{
    size_t len = strlen(text);
    size_t skip;

    if (count <= 0)
        return;
    skip = ((size_t) count > len) ? len : (size_t) count;
    memmove(text, text + skip, len - skip + 1);
}

/*
 * test_words - index the parts of a line selected by the /KEYWORD rules.
 *
 *   line         - source line to examine (not modified)
 *   punct        - word separator characters, passed through to build_words()
 *   minimum_word - minimum word length, passed through to build_words()
 *   keywords     - table of keyword rules, ended by the first unused entry
 *
 * Without /KEYWORD the whole line is indexed.  Otherwise the global
 * keyword_index remembers, from one line to the next, which rule (if any)
 * is currently open: -1 means "between keywords", anything else is the
 * index of the rule whose end text is being looked for.
 */
void test_words(char *line, char *punct, int minimum_word, topic_str *keywords)
{
    char test_line[MAX_INPUT_LINE], copy_line[MAX_INPUT_LINE];
    int ind, where, retry;

    if (!switch_present("keyword")) {           /* no /keyword= */
        build_words(line, punct, minimum_word); /* index everything */
        return;
    }

    /* bounded copy of the source line; the working copy gets chopped up */
    strncpy(test_line, line, sizeof(test_line) - 1);
    test_line[sizeof(test_line) - 1] = '\0';

    do {
        where = 0;
        retry = FALSE;

        if (keyword_index == -1) {              /* between keywords */
            for (ind = 0; keywords[ind].used; ind++)
                if ((where = find_str(test_line, keywords[ind].text)) != 0)
                    break;
            if (where) {
                keyword_index = ind;            /* record current keyword */
                /* remove everything up to the keyword (plus its offset) */
                drop_leading(test_line, (long) where - 1 + keywords[ind].offset);
                if (keywords[ind].exclude)      /* and the keyword itself */
                    drop_leading(test_line, (long) strlen(keywords[ind].text));
            }
        }

        if (keyword_index > -1) {               /* inside a keyword range */
            where = find_str(test_line, keywords[keyword_index].end);
            if (where) {
                strcpy(copy_line, test_line);
                test_line[where - 1] = '\0';
                build_words(test_line, punct, minimum_word); /* index up to end text */
                strcpy(test_line, copy_line + where - 1);    /* restart at end text */
                keyword_index = -1;             /* no longer indexing */
                retry = TRUE;                   /* check for another keyword */
            }
            else {                              /* indexing and no end text found */
                build_words(test_line, punct, minimum_word);
                return;
            }
        }
    } while (retry);
}
/* break line into words and save them in words[] */
/* write out the index entries */
write_index(int db_index, struct RAB *idxptr)
{
int ind, status, dup_count;
/* write out the words */
/* sort keys for counts and $put performance */
qsort(words, words_index, sizeof(char *), compare);
dup_count = 1;
strcpy(prev_keyword, words[0]);
for (ind = 1; ind < words_index; ind++)
if (strcmp(words[ind], prev_keyword) == 0)
dup_count++;
else {
sprintf(idx_record, "%-*s%0*d%0*d",
max_word, prev_keyword,
max_topic, db_index,
max_count, dup_count);
status = sys$put(idxptr);
if ((status & 1) != SS$_NORMAL)
lib$stop(status);
strcpy(prev_keyword, words[ind]);
dup_count = 1;
}
/* write out the last word */
sprintf(idx_record, "%-*s%0*d%0*d",
max_word, prev_keyword,
max_topic, db_index,
max_count, dup_count);
status = sys$put(idxptr);
if ((status & 1) != SS$_NORMAL)
lib$stop(status);
}
/*
 * reset_topic - forget everything collected for the current section.
 *
 *   src       - source file; its current position becomes the new start
 *   desc      - description buffer to clear; write_words() fills it with
 *               strncpy(desc, ..., DESC_SIZE), so it holds at least
 *               DESC_SIZE bytes
 *   start_pos - receives the file position where the next section starts
 *   topics    - topic rules whose captured text is emptied
 *
 * Also empties every collected word and resets words_index to zero.
 */
void reset_topic(FILE *src, char *desc, int *start_pos, topic_str *topics)
{
    int ind;

    /* desc is a pointer here, so sizeof(desc) would only cover the first
       few bytes; clear the whole description instead */
    strncpy(desc, "", DESC_SIZE);

    *start_pos = ftell(src);                    /* init start position */

    for (ind = 0; ind < words_index; ind++)     /* clear out words[] */
        *words[ind] = '\0';
    words_index = 0;

    for (ind = 0; ind < TOPIC_SIZE; ind++)      /* drop captured topic text */
        if (topics[ind].found)
            *topics[ind].found = '\0';
}
/*
 * write_words - close the current section: write its Gopher selector and
 * its index entries, then reset the per-section state.
 *
 *   src       - source document (supplies the file name for range selectors)
 *   lnk       - .link output file, used only with /LINK
 *   selptr    - RAB of the selection file, used without /LINK
 *   idxptr    - RAB of the index file, handed to write_index()
 *   start_pos - start of the section in src; reset for the next section
 *   end_pos   - end of the section in src
 *   desc      - description of the section (at least DESC_SIZE bytes);
 *               replaced by the captured topic texts when there are any
 *   topics    - topic rules with the text captured for this section
 *
 * Nothing is written when the section has no description.  Without /LINK
 * nothing is written when the section has no words either, and the program
 * exits with status 5 when the topic number no longer fits in max_topic
 * digits.
 *
 * A selector captured from the text has the form  selector|host|port
 * where the first character of the selector is the Gopher type.
 */
void write_words(FILE *src, FILE *lnk, struct RAB *selptr, struct RAB *idxptr,
                 int *start_pos, int *end_pos, char *desc, topic_str *topics)
{
    static int db_index = 0;
    int ind, status, have_selector;
    char filename[256], *ptr, temp_desc[512] = "";
    char hostname[100], portname[10];

    keyword_index = -1;                 /* stop indexing at end of section */

    if (!switch_present("link")) {
        if (words_index == 0) {
            reset_topic(src, desc, start_pos, topics);
            return;                     /* no words to write */
        }
        if ((db_index + 2) >= power(10, max_topic)) {
            printf("You have reached %d topics in this index\n", db_index);
            printf("Please re-index with /MAX_TOPIC larger than %d\n", max_topic);
            exit(5);
        }
    }

    fgetname(src, filename);
    if (!switch_present("version")) {   /* if /noversion */
        ptr = strchr(filename, ';');    /* get rid of version number */
        if (ptr) *ptr = '\0';
    }
    for (ptr = filename; *ptr; ptr++)
        *ptr = _tolower(*ptr);          /* force filename lowercase */

    /* join the captured topic texts; strncat keeps temp_desc in bounds */
    for (ind = 0; ind < TOPIC_SIZE; ind++)
        if (topics[ind].found && (strlen(topics[ind].found) > 0)) {
            if (strlen(temp_desc) > 0)
                strncat(temp_desc, " ",
                        sizeof(temp_desc) - strlen(temp_desc) - 1);
            strncat(temp_desc, topics[ind].found,
                    sizeof(temp_desc) - strlen(temp_desc) - 1);
        }
    if (strlen(temp_desc) > 0) {
        strncpy(desc, temp_desc, DESC_SIZE);
        desc[DESC_SIZE - 1] = '\0';     /* strncpy does not terminate on truncation */
    }

    if (strlen(desc) > 0) {             /* no description, no index */
        have_selector = (selector_spec != NULL) && (*selector_spec != '\0');

        if (have_selector) {
            if (switch_present("selector.ignore") && /* should we ignore this? */
                (strchr(switch_value("selector.ignore"), *selector_spec))) {
                reset_topic(src, desc, start_pos, topics);
                return;                 /* yes, ignore this */
            }

            strncpy(hostname, "", sizeof(hostname)); /* init hostname */
            strncpy(portname, "", sizeof(portname)); /* init portname */
            if ((ptr = strchr(selector_spec, '|')) != NULL) {
                *ptr = '\0';            /* mark off selector from host */
                /* leading tab + bounded copy of host/port */
                sprintf(hostname, "\t%.*s", (int) sizeof(hostname) - 2, ptr + 1);
                if ((ptr = strchr(hostname, '|')) != NULL) {
                    *ptr = '\0';        /* mark off port from host */
                    sprintf(portname, "\t%.*s", (int) sizeof(portname) - 2, ptr + 1);
                }
            }

            if (switch_present("link")) {
                fprintf(lnk, "Name=%s\nType=%c\n", desc, *selector_spec);
                fprintf(lnk, "Path=%s\n", selector_spec + 1);
                /* the +1 skips the leading tab; "+" is written when no value was given */
                fprintf(lnk, "Port=%s\nHost=%s\n\n",
                        strlen(portname) ? portname + 1 : "+",
                        strlen(hostname) ? hostname + 1 : "+");
            }
            else {
                sprintf(sel_record, "%0*d%c%s\t%s%s%s",
                        max_topic, ++db_index, *selector_spec, desc,
                        selector_spec + 1, hostname, portname);
                selptr->rab$w_rsz = strlen(sel_record);
                if (((status = sys$put(selptr)) & 1) != SS$_NORMAL)
                    lib$stop(status);
                write_index(db_index, idxptr);
            }
            *selector_spec = '\0';      /* reset the current selector */
        }
        else if (switch_present("selector.text") &&
                 !switch_present("selector.both")) {
            /* selectors only, and no selector found in this section */
            reset_topic(src, desc, start_pos, topics);
            return;
        }
        /* else: no selector in this section; with /SELECTOR=BOTH only the
           range entry below is written (an empty selector has no type
           character and no path to emit) */

        if (!switch_present("selector.text") || switch_present("selector.both")) {
            if (!switch_present("link")) {
                /* write out the selector */
                if (switch_present("whole")) /* whole file is a special case */
                    sprintf(sel_record, "%0*d0%s\t0%s",
                            max_topic, ++db_index, desc, filename);
                else
                    sprintf(sel_record, "%0*d0%s\tR%d-%d-%s",
                            max_topic, ++db_index, desc, *start_pos, *end_pos,
                            filename);
                selptr->rab$w_rsz = strlen(sel_record);
                if (((status = sys$put(selptr)) & 1) != SS$_NORMAL)
                    lib$stop(status);
                write_index(db_index, idxptr);
            }
            else {
                fprintf(lnk, "Name=%s\nType=0\n", desc);
                fprintf(lnk, "Path=R%d-%d-%s\n", *start_pos, *end_pos, filename);
                fprintf(lnk, "Port=+\nHost=+\n\n");
            }
        }
        printf("%s\n", desc);
    }

    reset_topic(src, desc, start_pos, topics); /* clear words[], topics */
}
/*
 * load_words - read a file of words into a word table.
 *
 *   name        - switch naming the file ("noise" or "candidates")
 *   punct       - characters treated as word separators
 *   table       - address of the table of word buffers to fill
 *   table_size  - number of slots in the table (grown by expand_table())
 *   table_index - number of slots in use; advanced for every word stored
 *
 * When the switch is absent only "noise" has a fallback: the file
 * "_noise_words" opened with the default-name string below.  A file that
 * cannot be opened leaves the table untouched.
 *
 * Every line is lowercased, punctuation and control characters become
 * spaces, and each space-separated word is copied into the next slot.
 * NOTE(review): the capacity of a slot is set by expand_table(), which is
 * not visible here; words are copied without a length check.
 */
void load_words(char *name, char *punct, char ***table, int *table_size, int *table_index)
{
    FILE *nf;
    char *cp, *cp2, line[MAX_INPUT_LINE];
    static char file_name[256];
    size_t len;

    if (switch_present(name)) {
        strncpy(file_name, switch_value(name), sizeof(file_name) - 1);
        file_name[sizeof(file_name) - 1] = '\0';
        if ((nf = fopen(file_name, "r")) == NULL) {
            printf("Can't read data file %s\n", file_name);
            return;
        }
    }
    else if (strcmp(name, "noise"))
        return;                         /* only noise has a default file */
    else if ((nf = fopen("_noise_words", "r", "dna = gopher_root:[000000].dat")) == NULL)
        return;

    while (fgets(line, sizeof(line), nf)) {
        cp = strchr(line, '\n');
        if (cp) *cp = '\0';             /* remove newline */

        for (cp = line; *cp; cp++) {
            if (is_punct(*cp, punct) || iscntrl((unsigned char) *cp))
                *cp = ' ';              /* convert punctuation, tabs to spaces */
            *cp = _tolower(*cp);        /* force lowercase */
        }

        len = strlen(line);
        while ((len > 0) && (line[len - 1] == ' '))
            line[--len] = '\0';         /* remove trailing blanks */

        for (cp = line; *cp; cp++)
            if (*cp > ' ') break;       /* find first non-blank char */
        memmove(line, cp, strlen(cp) + 1); /* remove leading blanks (regions overlap) */

        /* split at spaces; the last word simply ends at the terminator,
           so no separator has to be appended to a possibly full buffer */
        for (cp = line; *cp; cp = cp2) {
            cp2 = strchr(cp, ' ');
            if (cp2)
                *cp2++ = '\0';
            else
                cp2 = cp + strlen(cp);
            if (*cp) {
                if (*table_index == *table_size) /* table full */
                    expand_table(table, table_size);
                strcpy((*table)[(*table_index)++], cp);
            }
        }
    }
    fclose(nf);
}
/*
 * is_punct - tell whether a character is one of the punctuation characters.
 *
 *   ch    - character to test
 *   punct - NUL-terminated list of punctuation characters
 *
 * Returns TRUE when ch occurs in punct, FALSE otherwise.  The terminating
 * NUL of punct is never matched.
 */
int is_punct(char ch, char *punct)
{
    while (*punct != '\0') {
        if (*punct++ == ch)
            return TRUE;
    }
    return FALSE;
}
/*
 * is_spaces - tell whether a fixed-column field of a line is blank.
 *
 *   line - NUL-terminated line to examine
 *   pos  - first column of the field (origin 1)
 *   size - width of the field in columns
 *
 * Returns TRUE when every column of the field that exists in the line is
 * white space, FALSE as soon as a non-blank character is found.  Columns
 * past the end of the line count as blank, so a field that starts beyond
 * the line, or runs off its end, is blank if the part inside the line is.
 * Columns before column 1 (pos <= 0) are ignored rather than read from
 * in front of the buffer.
 */
int is_spaces(char *line, int pos, int size)
{
    size_t len = strlen(line);
    int col, offset;

    for (offset = 0; offset < size; offset++) {
        col = pos + offset;
        if (col < 1)
            continue;                   /* before the start of the line */
        if ((size_t) col > len)
            break;                      /* past the end: the rest is blank */
        if (!isspace((unsigned char) line[col - 1]))
            return(FALSE);
    }
    return(TRUE);
}
/*
 * is_noise - tell whether a word should be left out of the index.
 *
 *   word         - NUL-terminated word to test
 *   size         - number of entries of noise[] to examine
 *   minimum_word - words shorter than this are always noise; values of
 *                  zero or less disable the length test
 *
 * A word is noise when it is too short, when it starts with a digit and
 * the "numbers" switch is off, or when it equals an entry of noise[].  The
 * search of noise[] stops at the first NULL entry.
 */
int is_noise(char *word, int size, int minimum_word)
{
    int ind;

    /* simple heuristic saves lots of noise entries; the explicit sign test
       keeps a negative minimum from turning into a huge unsigned value */
    if ((minimum_word > 0) && (strlen(word) < (size_t) minimum_word))
        return(TRUE);

    /* ctype functions need an unsigned char value */
    if ((!switch_present("numbers")) && isdigit((unsigned char) *word))
        return(TRUE);

    for (ind = 0; ind < size; ind++) {
        if (noise[ind] == NULL) return (FALSE);
        if (strcmp(noise[ind], word) == 0)
            return (TRUE);
    }
    return (FALSE);
}
/*
 * is_candidate - tell whether a word is in the candidate table.
 *
 *   word - NUL-terminated word to look up
 *   size - number of entries of candidate[] to examine
 *
 * Returns TRUE when word equals an entry of candidate[], FALSE otherwise.
 * The search stops at the first NULL entry.
 */
int is_candidate(char *word, int size)
{
    int slot = 0;

    while ((slot < size) && (candidate[slot] != NULL)) {
        if (strcmp(candidate[slot], word) == 0)
            return (TRUE);
        slot++;
    }
    return (FALSE);
}
if (++cur_string >= N_STRING) cur_string = 0;
for (cp = strings[cur_string];; cp++) {
*cp = tolower(*str++);
if (*cp == '\0') break;
}
return strings[cur_string];
}
/*
 * find_str - locate a string inside a record.
 *
 *   record - text to search
 *   str    - text to look for; may be NULL
 *
 * Both arguments are passed through lc() before they are compared.
 * Returns the position (origin 1) of the first match in record, or 0 when
 * str is NULL, empty, or not found.
 */
int find_str(char *record, char *str)
{
    char *haystack, *hit;

    if ((str == NULL) || (*str == '\0'))
        return (0);                     /* zero means string not found */

    haystack = lc(record);              /* record first, then str, as lc() is called in turn */
    hit = strstr(haystack, lc(str));

    return (hit != NULL) ? (hit - haystack + 1) : 0;
}
/* descr() creates a character descriptor and returns the address
of the descriptor to the caller. */
# define N_DESCR 10
static struct dsc$descriptor_s str_desc[N_DESCR];
static int cur_descr = -1;
do
ptr++;
while ((*ptr != '=') /* skip to the keyword/parameter */
&& (*ptr != ':')); /* separator character */
while (isspace(*++ptr)); /* skip spaces */
if (*ptr == '"') { /* if quoted string */
start = ++ptr; /* skip over quote */
for (; *ptr; ptr++) { /* skip to ending quote */
if ((*ptr == '"') && (*(ptr+1) == '"')) { /* doubled quotes? */
ptr++; /* yes, skip it */
continue;
}
if (*ptr == '"') /* un-doubled quote? */
break; /* yes, stop here */
}
}
else { /* else non-quoted string */
start = ptr; /* start of string */
while (*ptr
&& (*ptr != ' ')
&& (*ptr != ',')
&& (*ptr != '/')
&& (*ptr != ')'))
ptr++; /* skip to string terminator */
}
*dest = (char *) calloc((ptr - start) + 1, sizeof(char));
strncpy(*dest, start, ptr - start);
for (cp = *dest; *cp; *cp++) /* collapse doubled quotes to single */
if ((*cp == '"') && (*(cp+1) == '"'))
strcpy(cp, cp+1);
return (ptr);
}
/*
 * parse_topic - collect the /TOPIC rules of a command line.
 *
 *   line   - command line; lowercased in place
 *   topics - table of topic rules to fill
 *
 * Every /topic=(...) switch found fills the next free entry of topics[].
 * The entry counter is static, so successive calls keep adding to the same
 * table.  At most TOPIC_SIZE entries (0 .. TOPIC_SIZE-1) are used, which is
 * the range the other routines scan with "ind < TOPIC_SIZE"; further
 * /topic switches are ignored.
 *
 * List elements are recognised by their first letter: text, end, position,
 * size, offset, exclude, break.  "end" and "exclude" both start with 'e',
 * so their second letter is used instead.
 */
void parse_topic(char *line, topic_str *topics)
{
    char *ptr;
    static int index = -1;

    for (ptr = line; *ptr; ptr++)
        *ptr = _tolower(*ptr);          /* force command line lowercase */

    ptr = line;                         /* point to start of line */
    for (;;) {                          /* search for /topic until end of line */
        if (index + 1 >= TOPIC_SIZE)
            return;                     /* exit if we can't hold any more */
        ptr = strchr(ptr, '/');         /* search for switch start */
        if (ptr == NULL)
            return;                     /* no more switches */
        while (isspace((unsigned char) *++ptr)); /* skip spaces */
        if (*ptr != 't')                /* topic is unique to one character */
            continue;                   /* not /topic, keep scanning */

        while (*ptr && (*ptr != '=')    /* skip to the keyword/parameter */
               && (*ptr != ':'))        /* separator character */
            ptr++;
        if (*ptr == '\0')
            return;                     /* switch without a value: nothing to parse */
        while (isspace((unsigned char) *++ptr)); /* skip spaces */
        if (*ptr == '(')                /* if start of list */
            while (isspace((unsigned char) *++ptr)); /* skip spaces */

        index++;                        /* next topics structure */
        topics[index].used = TRUE;      /* this topic index is used */
        topics[index].pos = DEFAULT_POS; /* default position */
        topics[index].end = "</";       /* default end to HTML end tag */

        for (;;) {                      /* parse all /topic list elements */
            if (*ptr == 'e')            /* "end" or "exclude" */
                ptr++;                  /* so we match on 'n' or 'x' */
            switch (*ptr) {
            case 't':                   /* text */
                ptr = get_text(&topics[index].text, ptr);
                break;
            case 'n':                   /* end */
                /* continue after the value, as for text: the end text may
                   itself contain '/', ',', ')' or spaces */
                ptr = get_text(&topics[index].end, ptr);
                break;
            case 'p':                   /* position */
                topics[index].pos = get_decimal(ptr);
                break;
            case 's':                   /* size */
                topics[index].size = get_decimal(ptr);
                break;
            case 'o':                   /* offset */
                topics[index].offset = get_decimal(ptr);
                break;
            case 'x':                   /* exclude */
                topics[index].exclude = TRUE; /* has no parameters */
                break;
            case 'b':                   /* break */
                topics[index].force_break = TRUE; /* has no parameters */
                break;
            }
            while (*ptr
                   && (*ptr != ' ')     /* skip to end of */
                   && (*ptr != ',')     /* keyword */
                   && (*ptr != '/')     /* switch */
                   && (*ptr != ')'))    /* or parameter */
                ptr++;
            while (*ptr &&
                   ((*ptr <= ' ') ||    /* skip spaces, junk */
                    (*ptr == ',')))     /* list separators */
                ptr++;
            if (*ptr == '\0')
                return;                 /* end of the line */
            if ((*ptr == ')') || (*ptr == '/'))
                break;                  /* end of the list */
        }                               /* scan for more list elements */
    }
}
/*
 * parse_keyword - collect the /KEYWORD rules of a command line.
 *
 *   line     - command line; lowercased in place
 *   keywords - table of keyword rules to fill
 *
 * Every /keyword=(...) switch found fills the next free entry of
 * keywords[].  The entry counter is static, so successive calls keep
 * adding to the same table.  At most TOPIC_SIZE entries are used; further
 * /keyword switches are ignored.
 *
 * List elements are recognised by their first letter: text, end, offset,
 * exclude.  "end" and "exclude" both start with 'e', so their second
 * letter is used instead.
 */
void parse_keyword(char *line, topic_str *keywords)
{
    char *ptr;
    static int index = -1;

    for (ptr = line; *ptr; ptr++)
        *ptr = _tolower(*ptr);          /* force command line lowercase */

    ptr = line;                         /* point to start of line */
    for (;;) {                          /* search for /keyword until end of line */
        if (index + 1 >= TOPIC_SIZE)
            return;                     /* exit if we can't hold any more */
        ptr = strchr(ptr, '/');         /* search for switch start */
        if (ptr == NULL)
            return;                     /* no more switches */
        while (isspace((unsigned char) *++ptr)); /* skip spaces */
        if (*ptr != 'k')                /* keyword is unique to one character */
            continue;                   /* not /keyword, keep scanning */

        while (*ptr && (*ptr != '=')    /* skip to the keyword/parameter */
               && (*ptr != ':'))        /* separator character */
            ptr++;
        if (*ptr == '\0')
            return;                     /* switch without a value: nothing to parse */
        while (isspace((unsigned char) *++ptr)); /* skip spaces */
        if (*ptr == '(')                /* if start of list */
            while (isspace((unsigned char) *++ptr)); /* skip spaces */

        index++;                        /* next keywords structure */
        keywords[index].used = TRUE;    /* this keyword index is used */

        for (;;) {                      /* parse all /keyword list elements */
            if (*ptr == 'e')            /* "end" or "exclude" */
                ptr++;                  /* so we match on 'n' or 'x' */
            switch (*ptr) {
            case 't':                   /* text */
                ptr = get_text(&keywords[index].text, ptr);
                break;
            case 'n':                   /* end */
                /* continue after the value, as for text: the end text may
                   itself contain '/', ',', ')' or spaces */
                ptr = get_text(&keywords[index].end, ptr);
                break;
            case 'o':                   /* offset, parsed as for /topic */
                keywords[index].offset = get_decimal(ptr);
                break;
            case 'x':                   /* exclude */
                keywords[index].exclude = TRUE; /* has no parameters */
                break;
            }
            while (*ptr
                   && (*ptr != ' ')     /* skip to end of */
                   && (*ptr != ',')     /* keyword */
                   && (*ptr != '/')     /* switch */
                   && (*ptr != ')'))    /* or parameter */
                ptr++;
            while (*ptr &&
                   ((*ptr <= ' ') ||    /* skip spaces, junk */
                    (*ptr == ',')))     /* list separators */
                ptr++;
            if (*ptr == '\0')
                return;                 /* end of the line */
            if ((*ptr == ')') || (*ptr == '/'))
                break;                  /* end of the list */
        }                               /* scan for more list elements */
    }
}
/*
 * parse_switch - record the state and value of one command switch.
 *
 *   name      - switch (or switch.keyword) name as known to the CLI
 *   sw        - switch table, ended by an entry whose name is NULL
 *   def_sw    - default state of the switch (non-zero = on)
 *   def_value - default value of the switch, or NULL when it has none
 *
 * The switch is looked up in sw[] and registered on first use.  The
 * routine may be called several times for the same switch (sw[] entries
 * are reused by name), so the state is accumulated: a default-on switch
 * stays on only while every parse reports it present, a default-off switch
 * turns on as soon as one parse reports it.  A value replaces the stored
 * one when none is stored yet or when it differs from the default;
 * surrounding quotes are removed.
 */
void parse_switch(char *name, switch_str sw[], int def_sw, char *def_value)
{
    int status, ind;
    short leng;
    static char value[100];
    $DESCRIPTOR(switch_dsc, value);

    for (ind = 0; sw[ind].name; ind++)          /* find end of sw[] */
        if (strcmp(sw[ind].name, name) == 0)    /* or a pre-existing switch */
            break;

    if (sw[ind].name == NULL) {                 /* register name first time */
        sw[ind].name = calloc(strlen(name) + 1, sizeof(char));
        strcpy(sw[ind].name, name);
        sw[ind].state = def_sw;                 /* and set its default state */
    }

    if (def_sw)         /* if default on, remember if it's turned off */
        sw[ind].state &= (cli$present(descr(name)) & 1);
    else                /* if default off, remember if it's turned on */
        sw[ind].state |= (cli$present(descr(name)) & 1);

    status = cli$get_value(descr(name), &switch_dsc, &leng);
    if (status & 1) {
        /* keep the returned length inside value[] */
        if (leng < 0)
            leng = 0;
        if (leng > (short) (sizeof(value) - 1))
            leng = (short) (sizeof(value) - 1);
        value[leng] = '\0';

        /* if this is the first value or a non-default value, save it;
           with no default, every value counts as non-default */
        if ((sw[ind].value == NULL) || (def_value == NULL)
            || strcmp(def_value, value)) {
            if ((*value == '"') && (leng >= 2)) { /* remove quotes from quoted string */
                memmove(value, value + 1, (size_t) (leng - 2)); /* regions overlap */
                value[leng - 2] = '\0';
            }
            sw[ind].value = calloc(strlen(value) + 1, sizeof(char));
            strcpy(sw[ind].value, value);
        }
    }
    else        /* hack around CLI bug that doesn't do selector.end right */
        if (def_value) {                        /* no value, but default exists */
            sw[ind].value = calloc(strlen(def_value) + 1, sizeof(char));
            strcpy(sw[ind].value, def_value);
            sw[ind].state = def_sw;
        }
}