/* index qi data file */
/* Bruce Tanner - Cerritos College */
/* 1.0 1993/08/14 Start with build_index */
/* 1.1 1993/08/30 Make fopen failure more explicit */
/* 1.2 1993/09/04 Move soundex creation outside */
#include ssdef
#include stdio
#include string
#include ctype
#include rms
#include descrip
#include climsgdef
#include assert
#include "qi.h"
/*
 * File-scope buffers shared between main() and index_words().
 * main() points the index RAB at idx_record/idx_key and the data RAB at
 * dat_record/dat_key; each array has one extra byte so the contents can
 * also be handled as a NUL-terminated C string.
 */
char idx_record[IDX_RECORD_SIZE + 1];
char idx_key[IDX_KEY_SIZE + 1];
char dat_record[DAT_RECORD_SIZE + 1];
char dat_key[DAT_KEY_SIZE + 1];
/* attribute bits for each field number; cleared and filled by read_fields() */
int field_attrib[MAX_FIELD];
/* build mode: stays 0 unless main() sees /create or /merge */
int mode = 0;
#define CREATE 1
#define MERGE 2
/* forward declarations of the routines defined later in this file */
void read_fields(char *);
void index_words(char *, struct RAB *, struct RAB *);
struct dsc$descriptor_s *descr(char *);
/* defined outside this file; main() passes it to cli$dcl_parse()
 * (presumably the compiled command table -- confirm against the .CLD) */
void build_commands();
/* run-time library routines used by main() for the command line */
int lib$get_foreign(), lib$get_input();
main(int argc, char *argv[])
{
FILE *src;
char cli_input[256], file_arg[256], file_spec[256];
char idx_name[256], dat_name[256];
char *ptr, field[DATA_SIZE + 1];
char dat_copy[DAT_RECORD_SIZE + 1];
int status, context = 0, count = 0;
short leng;
struct FAB idxfab, datfab;
struct RAB idxrab, datrab;
struct XABKEY idxxab, datxab;
$DESCRIPTOR(input_dsc, cli_input);
$DESCRIPTOR(file_dsc, file_arg);
$DESCRIPTOR(file_spec_dsc, file_spec);
$DESCRIPTOR(idx_dsc, idx_name);
status = lib$get_foreign(&input_dsc, 0, &leng, 0);
strncpy(cli_input+6, cli_input, leng);
strncpy(cli_input, "build ", 6);
status = cli$dcl_parse(&input_dsc, build_commands, lib$get_input);
if (status != CLI$_NORMAL) /* error in parse, exit */
exit(1);
if ((cli$present(descr("file")) & 1) == 0) {
printf("Usage: build data_file /data/create/merge/config=.../output=...\n");
exit(3);
}
status = cli$get_value(descr("file"), &file_dsc, &leng); /* get source */
status = lib$find_file(&file_dsc, &file_spec_dsc, &context, 0, 0, 0, 0);
ptr = strchr(file_spec, ' ');
if (ptr) *ptr = '\0'; /* chop off trailing spaces */
strcpy(idx_name, file_spec); /* make copy for output spec */
if (cli$present(descr("output")) & 1) { /* if /output, overwrite out_name */
status = cli$get_value(descr("output"), &idx_dsc, &leng);
idx_name[leng] = '\0';
}
if (cli$present(descr("create")) & 1)
mode = CREATE;
if (cli$present(descr("merge")) & 1)
mode = MERGE;
ptr = strrchr(idx_name, '.'); /* just get file name */
if (ptr) *ptr = '\0';
strcat(idx_name, ".INDEX");
idxfab = cc$rms_fab;
idxfab.fab$b_bks = 6;
idxfab.fab$b_fac = FAB$M_GET | FAB$M_PUT;
idxfab.fab$l_fna = idx_name;
idxfab.fab$b_fns = strlen(idx_name);
idxfab.fab$l_fop = FAB$M_CBT | FAB$M_DFW;
idxfab.fab$w_mrs = IDX_RECORD_SIZE;
idxfab.fab$b_org = FAB$C_IDX;
idxfab.fab$b_rat = FAB$M_CR;
idxfab.fab$b_rfm = FAB$C_FIX;
idxfab.fab$b_shr = FAB$M_NIL;
idxfab.fab$l_xab = &idxxab;
idxrab = cc$rms_rab;
idxrab.rab$l_fab = &idxfab;
idxrab.rab$b_krf = 0;
idxrab.rab$l_kbf = idx_key;
idxrab.rab$b_ksz = IDX_KEY_SIZE;
idxrab.rab$b_rac = RAB$C_KEY;
idxrab.rab$l_rbf = idx_record;
idxrab.rab$w_rsz = IDX_RECORD_SIZE;
idxrab.rab$l_ubf = idx_record;
idxrab.rab$w_usz = IDX_RECORD_SIZE;
idxrab.rab$b_mbf = 20;
idxrab.rab$l_rop = RAB$M_RAH | RAB$M_WBH;
idxxab = cc$rms_xabkey;
idxxab.xab$b_dtp = XAB$C_STG;
idxxab.xab$b_flg = XAB$M_IDX_NCMPR;
idxxab.xab$w_pos0 = 0;
idxxab.xab$b_siz0 = IDX_KEY_SIZE;
idxxab.xab$b_ref = 0;
strcpy(dat_name, idx_name);
ptr = strrchr(dat_name, '.'); /* just get file name */
if (ptr) *ptr = '\0';
strcat(dat_name, ".DATA");
datfab = cc$rms_fab;
datfab.fab$b_bks = 9;
datfab.fab$b_fac = FAB$M_GET | FAB$M_PUT | FAB$M_UPD;
datfab.fab$l_fna = dat_name;
datfab.fab$b_fns = strlen(dat_name);
datfab.fab$l_fop = FAB$M_CBT | FAB$M_DFW;
datfab.fab$w_mrs = DAT_RECORD_SIZE;
datfab.fab$b_org = FAB$C_IDX;
datfab.fab$b_rat = FAB$M_CR;
datfab.fab$b_rfm = FAB$C_VAR;
datfab.fab$b_shr = FAB$M_NIL;
datfab.fab$l_xab = &datxab;
datrab = cc$rms_rab;
datrab.rab$l_fab = &datfab;
datrab.rab$b_krf = 0;
datrab.rab$l_kbf = dat_key;
datrab.rab$b_ksz = DAT_KEY_SIZE;
datrab.rab$b_rac = RAB$C_KEY;
datrab.rab$l_rbf = dat_record;
datrab.rab$b_mbf = 20;
datrab.rab$l_rop = RAB$M_RAH | RAB$M_WBH;
datxab = cc$rms_xabkey;
datxab.xab$b_dtp = XAB$C_STG;
datxab.xab$b_flg = XAB$M_DAT_NCMPR | XAB$M_IDX_NCMPR;
datxab.xab$w_pos0 = 0;
datxab.xab$b_siz0 = DAT_KEY_SIZE;
datxab.xab$b_ref = 0;
/* open index file */
if (mode == CREATE)
if (((status = sys$create(&idxfab)) & 1) != SS$_NORMAL)
lib$stop(status);
if (mode == MERGE)
if (((status = sys$open(&idxfab)) & 1) != SS$_NORMAL)
lib$stop(status);
if (((status = sys$connect(&idxrab)) & 1) != SS$_NORMAL)
lib$stop(status);
/* open data file */
if (cli$present(descr("DATA")) & 1) {
if (mode == CREATE)
if (((status = sys$create(&datfab)) & 1) != SS$_NORMAL)
lib$stop(status);
if (mode == MERGE)
if (((status = sys$open(&datfab)) & 1) != SS$_NORMAL)
lib$stop(status);
if (((status = sys$connect(&datrab)) & 1) != SS$_NORMAL)
lib$stop(status);
}
/* record the fields with Indexed attribute */
read_fields(file_spec);
for (;;) { /* process all files in input spec, first one already found */
if ((src = fopen(file_spec, "r", "mbc=50", "mbf=20")) == NULL) {
printf("Can't read input file %s\n", file_spec);
exit(5);
}
printf("Building index for %s\n", file_spec);
while (fgets(dat_record, sizeof(dat_record), src)) {
if ((ptr = strchr(dat_record, '\r')) ||
(ptr = strchr(dat_record, '\n')))
*ptr = '\0'; /* remove newline */
if (strlen(dat_record) == 0)
continue; /* skip blank lines */
if ((++count % 500) == 0)
printf("%d\n", count);
/* if /DATA requested, write .data file record */
if (cli$present(descr("data")) & 1) {
strncpy(dat_key, dat_record, DAT_KEY_SIZE);
datrab.rab$w_rsz = strlen(dat_record);
if ((status = sys$put(&datrab)) != RMS$_NORMAL) {
if ((status == RMS$_DUP) && (mode == MERGE)) {
status = sys$find(&datrab);
status = sys$update(&datrab); /* update the record */
}
if (status != RMS$_NORMAL) {
printf("DATA key (%d chars) %s\n", strlen(dat_key), dat_key);
printf("DATA rec (%d chars) %s\n", strlen(dat_record), dat_record);
lib$stop(status);
}
}
}
strcpy(dat_copy, dat_record);
/* if this is an indexed field, write index record(s) */
strncpy(field, dat_copy + ID_SIZE, FIELD_SIZE);
field[FIELD_SIZE] = '\0';
if (field_attrib[atoi(field)] & ATTR_INDEXED) {
for (ptr = dat_copy; *ptr; ptr++)
if (iscntrl(*ptr)) *ptr = ' '; /* convert tabs to spaces */
while ((strlen(dat_copy) > 0) &&
(dat_copy[strlen(dat_copy)-1] == ' '))
dat_copy[strlen(dat_copy)-1] = '\0';/* remove trailing blanks */
for (ptr = dat_copy; *ptr; ptr++)
*ptr = _tolower(*ptr); /* force lowercase */
index_words(dat_copy, &idxrab, &datrab);
}
}
fclose(src);
status = lib$find_file(&file_dsc, &file_spec_dsc, &context, 0, 0, 0, 0);
if ((status & 1) == 0) {
lib$find_file_end(&context);
break;
}
ptr = strchr(file_spec, ' ');
if (ptr) *ptr = '\0'; /* chop off trailing spaces */
}
if (cli$present(descr("data")) & 1)
sys$close(&datfab);
sys$close(&idxfab);
}
/* break data field into words and write them to index file */
void index_words(char *line, struct RAB *idxptr, struct RAB *datptr)
{
char data[DATA_SIZE + 2], field[FIELD_SIZE + 1], id[ID_SIZE + 1];
char *cp, *cp2;
int status;
strncpy(id, line, ID_SIZE);
id[ID_SIZE] = '\0';
strncpy(field, line + ID_SIZE, FIELD_SIZE);
field[FIELD_SIZE] = '\0';
strncpy(data, line + ID_SIZE + FIELD_SIZE + SEQ_SIZE + ATTR_SIZE, DATA_SIZE);
data[DATA_SIZE] = '\0';
/* special hack to omit indexing the email domain */
if ((strcmp(field, EMAIL_FIELD) == 0) && (cp = strchr(data, '@')))
*cp = '\0';
#if NAME_HACK
if (strcmp(field, NAME_FIELD) == 0) /* only edit name field */
for (cp = data; *cp; cp++) { /* apply any special editing to names */
if (*cp == '-') *cp = ' '; /* index both hyphenated names */
if (*cp == '\'') strcpy(cp, cp+1); /* squeeze out apostrophe */
}
#endif
strcat(data, " "); /* line ends with a space */
cp = data;
while(cp2 = strchr(cp, ' ')) { /* break at space boundary */
*cp2 = '\0';
if (strlen(cp) > KEYWORD_SIZE)
printf("Truncating %d character word /%s/ to %d characters\n",
strlen(cp), cp, KEYWORD_SIZE);
if (strlen(cp) >= MIN_KEYWORD) {
sprintf(idx_record, "%-*.*s%s%s", KEYWORD_SIZE, KEYWORD_SIZE, cp, field, id);
strncpy(idx_key, idx_record, IDX_KEY_SIZE);
idx_key[IDX_KEY_SIZE] = '\0';
if ((field_attrib[atoi(field)] & ATTR_UNIQUE) &&
((status = sys$get(idxptr)) & 1)) /* unique record found? */
printf("Omit duplicate unique record: %s\n", line);
else {
idxptr->rab$w_rsz = IDX_RECORD_SIZE;
if (((status = sys$put(idxptr)) & 1) == 0)
if (status != RMS$_DUP)
lib$stop(status);
}
}
cp = cp2 + 1;
}
}
/*
 * Copy the next ':'-delimited field of a configuration line.
 *
 *   ptr    start of the field inside the line
 *   field  receives the field text, each character passed through
 *          _tolower(), NUL-terminated; the caller must supply a buffer
 *          large enough for the longest possible field
 *
 * Returns a pointer just past the terminating ':' or, when the field
 * runs to the end of the line, a pointer to the line's terminating NUL.
 */
char * get_field(char *ptr, char *field)
{
    char *out = field;

    while (*ptr != '\0' && *ptr != ':') {
        *out++ = _tolower(*ptr);
        ptr++;
    }
    *out = '\0';

    /* skip over terminating ":" */
    return (*ptr == ':') ? ptr + 1 : ptr;
}
/*
 * Load the per-field attribute bits into the global field_attrib[].
 *
 *   file   input data file specification; when /CONFIGURATION is absent
 *          the config file name is this spec with its last ".xxx" part
 *          replaced by ".cnf"
 *
 * Each non-comment config line has ':'-separated fields: number, name,
 * size, description, option, then any number of attribute names.  An
 * attribute is recognised by its first letter, compared against the
 * entries of the attributes[] table, and its value is OR-ed into
 * field_attrib[number].  Lines whose field number lies outside
 * 0..MAX_FIELD-1 are reported and ignored.
 *
 * Exits with code 7 if the config file cannot be opened.
 */
void read_fields(char *file)
{
    FILE *cnf;
    /* field is as large as line: get_field() may copy a whole line into it */
    char *ptr, config[256], line[256], field[256];
    int ind, field_num;
    short leng = 0;                     /* stays 0 if cli$get_value fails */
    $DESCRIPTOR(config_dsc, config);

    if (cli$present(descr("configuration")) & 1) {  /* if /config */
        cli$get_value(descr("configuration"), &config_dsc, &leng);
        config[leng] = '\0';
    }
    else {                              /* no /config switch */
        strcpy(config, file);
        ptr = strrchr(config, '.');
        if (ptr) *ptr = '\0';
        strcat(config,".cnf");
    }

    for (ind = 0; ind < MAX_FIELD; ind++)
        field_attrib[ind] = 0;          /* init array */

    if ((cnf = fopen(config, "r", "dna=.cnf")) == NULL) {
        printf("Can't read config file %s\n", config);
        exit(7);
    }

    while (fgets(line, sizeof(line), cnf)) {
        ptr = strchr(line, '\n');
        if (ptr) *ptr = '\0';           /* remove newline */
        ptr = line;
        if ((*ptr == '#') || (*ptr == '\0'))    /* comment or blank? */
            continue;                           /* yes, skip line */

        ptr = get_field(ptr, field);    /* field number */
        field_num = atoi(field);
        if (field_num < 0 || field_num >= MAX_FIELD) {
            /* would index outside field_attrib[] */
            printf("Ignoring config line with field number %d (valid range 0-%d)\n",
                   field_num, MAX_FIELD - 1);
            continue;
        }

        ptr = get_field(ptr, field);    /* field name */
        ptr = get_field(ptr, field);    /* field size */
        ptr = get_field(ptr, field);    /* field description */
        ptr = get_field(ptr, field);    /* field option */

        for (;;) {
            ptr = get_field(ptr, field);    /* get attribute */
            if (strlen(field) == 0)
                break;                      /* no more attributes */
            /* attributes are unique to one letter */
            for (ind = 0; ind < MAX_ATTRIBUTES; ind++)
                if (field[0] == _tolower(attributes[ind].name[0]))
                    field_attrib[field_num] |= attributes[ind].value;
        }
    }
    fclose(cnf);
}
/* descr() creates character descriptor and returns
* the address of the descriptor to the caller.
*/
# define N_DESCR 10
static struct dsc$descriptor_s str_desc[N_DESCR];
static int cur_descr = -1;
struct dsc$descriptor_s *descr(char *string)
{
if(++cur_descr >= N_DESCR) cur_descr = 0;
str_desc[cur_descr].dsc$w_length=(short)strlen(string);
str_desc[cur_descr].dsc$b_dtype=DSC$K_DTYPE_T;
str_desc[cur_descr].dsc$b_class=DSC$K_CLASS_S;
str_desc[cur_descr].dsc$a_pointer=string;
return (&str_desc[cur_descr]);
}