/* l2xirexp.c builds recognizers for two kinds of regular expressions */
/* Based on rexpr.c, part of the PCCTS compiler/generator distribution */
/*
* This file contains code for
*
* int rexpr(char *expr, char *s);
*
* which answers
*
* 1 if 's' is in the language described by the regular expression 'expr'
* 0 if it is not
* -1 if the regular expression is invalid
*
* Language membership is determined by constructing a non-deterministic
* finite automata (NFA) from the regular expression. A depth-
* first-search is performed on the NFA (graph) to check for a match of 's'.
* Each non-epsilon arc consumes one character from 's'. Backtracking is
* performed to check all possible paths through the NFA.
*
* Regular expressions follow the meta-language:
*
* <regExpr> ::= <andExpr> ( '|' <andExpr> )*
*
* <andExpr> ::= <expr> ( <expr> )*
*
* <expr> ::= {'~'} '[' <atomList> ']' <repeatSymbol>
* | '(' <regExpr> ')' <repeatSymbol>
* | '{' <regExpr> '}' <repeatSymbol>
* | <atom> <repeatSymbol>
*
* <repeatSymbol> ::= { '*' | '+' }
*
* <atomList> ::= <atom> ( <atom> )*
* | { <atomList> } <atom> '-' <atom> { <atomList> }
*
* <atom> ::= Token[Atom]
*
* Notes:
* ~ means complement the set in [..]. i.e. all characters
* not listed
* * means match 0 or more times (can be on expression
* or atom)
* + means match 1 or more times (can be on expression
* or atom)
* {} optional
* () grouping
* [] set of atoms
* x-y all characters from x to y (found only in [..])
* \xx the character with value xx
*
* Examples:
* [a-z]+
* match 1 or more lower-case letters (e.g. variable)
*
* 0x[0-9A-Fa-f]+
* match a hex number with 0x on front (e.g. 0xA1FF)
*
* [0-9]+.[0-9]+{e[0-9]+}
* match a floating point number (e.g. 3.14e21)
*
* Code example:
* if ( rexpr("[a-zA-Z][a-zA-Z0-9]+", str) ) then str is keyword
*
* Terence Parr
* Purdue University
* April 1991
*/
/**********************************************************************/
/* some modifications by Peter Wilson, Catholic University of America */
/*
* Complement changed from ~[...] to [^...] to match AWK/lex
* Optional changed from {...} to (r)? to match AWK/lex
* Swapped parameters in rexpr()
*
* Major additions/changes to support the EXPRESS LIKE operator:
* string LIKE pattern -> TRUE or FALSE
*
* @ matches any letter
* ^ matches any uppercase letter
* ? matches any character
* & matches remainder of string
* # matches any digit
* $ matches any substring terminated by space or end of string
* * matches any number of characters
* \ escape character
*
*/
/**********************************************************************/
int like_op = 0; /* PRW = 1 iff processing EXPRESS LIKE pattern */
/*
* return 1 if s in language described by expr
* 0 if s is not
* -1 if expr is an invalid regular expression
*/
rexpr(s, expr)
char *expr, *s;
{
NodePtr p,q;
Graph nfa;
int result;
entry_debug("rexpr (l2xirexp.c)");
sprintf(dbuffer, "rexpr(%s, %s);\n", s, expr);
debug_print(dbuffer);
like_op = 0;
freelist = NULL;
_c = expr;
next();
if ( regExpr(&nfa) == -1 ) return -1;
accept = nfa.right;
result = match(nfa.left, s);
/* free all your memory */
p = q = freelist;
while ( p!=NULL ) { q = p->track; free(p); p = q; }
exit_debug("rexpr");
return result;
}
/*
* do a depth-first-search on the NFA looking for a path from start to
* accept state labelled with the characters of 's'.
*/
match(automaton, s)
NodePtr automaton;
char *s;
{
ArcPtr p;
if ( automaton == accept && *s == '\0' ) return 1; /* match */
for (p=automaton->arcs; p!=NULL; p=p->next) /* try all arcs */
{
if ( p->label == Epsilon )
{
if ( match(p->target, s) ) return 1;
}
else if ( p->label == *s )
if ( match(p->target, s+1) ) return 1;
}
return 0;
}
/*
* <regExpr> ::= <andExpr> ( '|' {<andExpr>} )*
*
* Return -1 if syntax error
* Return 0 if none found
* Return 1 if a regExrp was found
*/
static
regExpr(g)
GraphPtr g;
{
Graph g1, g2;
if ( andExpr(&g1) == -1 )
{
return -1;
}
while ( rx_token == '|' )
{
int a;
next();
a = andExpr(&g2);
if ( a == -1 ) return -1; /* syntax error below */
else if ( !a ) return 1; /* empty alternative */
g1 = BuildNFA_AorB(g1, g2);
}
/*
* <atomList> ::= <atom> { <atom> }*
* { <atomList> } <atom> '-' <atom> { <atomList> }
*
* a-b is same as ab
* q-a is same as q
*/
static
atomList(p, complement)
char *p;
int complement;
{
static unsigned char set[256]; /* no duplicates */
int first, last, i;
char *s = p;
if ( rx_token != Atom ) return -1;
for (i=0; i<256; i++) set[i] = 0;
while ( rx_token == Atom )
{
if ( !set[tokchar] ) *s++ = tokchar;
set[tokchar] = 1; /* Add atom to set */
next();
if ( rx_token == '-' ) /* have we found '-' */
{
first = *(s-1); /* Get last char */
next();
if ( rx_token != Atom ) return -1;
else
{
last = tokchar;
}
for (i = first+1; i <= last; i++)
{
if ( !set[tokchar] ) *s++ = i;
set[i] = 1; /* Add atom to set */
}
next();
}
}
*s = '\0';
if ( complement )
{
for (i=0; i<256; i++) set[i] = !set[i];
for (i=1,s=p; i<256; i++) if ( set[i] ) *s++ = i;
*s = '\0';
}
return 1;
}
/* a somewhat stupid lexical analyzer */
static
next()
{
/* PRW seperate scanners for regular expression and LIKE operator */
if (like_op == 0) { /* regex scanning */
while ( *_c==' ' || *_c=='\t' || *_c=='\n' ) _c++;
if ( *_c=='\\' )
{
_c++;
if ( isdigit(*_c) )
{
int n=0;
while ( isdigit(*_c) )
{
n = n*10 + (*_c++ - '0');
}
if ( n>255 ) n=255;
tokchar = n;
}
else
{
switch (*_c)
{
case 'n' : tokchar = '\n'; break;
case 't' : tokchar = '\t'; break;
case 'r' : tokchar = '\r'; break;
default : tokchar = *_c;
}
_c++;
}
rx_token = Atom;
}
else if ( isgraph(*_c) && *_c!='['
&& *_c!='('
&& *_c!='-'
&& *_c!=')'
&& *_c!=']'
&& *_c!='?'
&& *_c!='+'
&& *_c!='*'
&& *_c!='^'
&& *_c!='|' )
{ /* PRW deleted {, } and ~ while adding ^ and ? */
rx_token = Atom;
tokchar = *_c++;
}
else
{
rx_token = tokchar = *_c++;
}
return;
} /* end of regex scanner */
else if (like_op == 1) { /* LIKE scanning */
while ( *_c==' ' || *_c=='\t' || *_c=='\n' ) _c++;
if ( *_c=='\\' )
{
_c++;
switch (*_c)
{
case 'n' : tokchar = '\n'; break;
case 't' : tokchar = '\t'; break;
case 'r' : tokchar = '\r'; break;
default : tokchar = *_c;
}
_c++;
rx_token = Atom;
}
else if ( isgraph(*_c) && *_c!='@'
&& *_c!='^'
&& *_c!='?'
&& *_c!='&'
&& *_c!='#'
&& *_c!='$'
&& *_c!='*'
&& *_c!='!' )
{
rx_token = Atom;
tokchar = *_c++;
}
else
{
rx_token = tokchar = *_c++;
}
return;
} /* end of LIKE scanner */
}
/* N F A B u i l d i n g R o u t i n e s */
static
ArcPtr
newGraphArc()
{
ArcPtr p;
p = (ArcPtr) calloc(1, sizeof(Arc));
if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}
if ( freelist != NULL ) p->track = (ArcPtr) freelist;
freelist = (NodePtr) p;
return p;
}
static
NodePtr
newNode()
{
NodePtr p;
p = (NodePtr) calloc(1, sizeof(Node));
if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}
if ( freelist != NULL ) p->track = freelist;
freelist = p;
return p;
}
static
ArcBetweenGraphNodes(i, j, label)
NodePtr i, j;
int label;
{
ArcPtr a;
while (rx_token != '\0') {
if (likeExpr(&g2) == -1) {
return(-1);
}
g1 = BuildNFA_AB(g1, g2);
}
*g = g1;
return(1);
} /* end LIKEPAT */
/************************************************************************/
/************************************************************************/
/* like_expr(expr,s) */
/* returns -1 if expr not an EXPRESS LIKE pattern */
/* 0 if s does not match pattern expr */
/* 1 if s matches pattern expr */
int like_expr(s, expr)
char *s; /* string to be matched */
char *expr; /* the pattern */
{
NodePtr p, q;
Graph nfa;
int result;
like_op = 1; /* set correct scanner */
freelist = NULL;
_c = expr;
next();
if (likePat(&nfa) == -1) {
return(-1);
}
accept = nfa.right;
result = match(nfa.left, s);
/* free memory */
p = q = freelist;
while (p != NULL) {
q = p->track;
free(p);
p = q;
}
exit_debug("like_expr");
return(result);
} /* end LIKE_EXPR */
/************************************************************************/
/************************************************************************/
/* do_not_atom() Process !a */
int do_not_atom(p, c)
char *p;
int c; /* the negated atom char */
{
static unsigned char set [256];
int i;
char *s = p;
for (i = 0; i < 256; i++) { /* all characters */
set[i] = 1;
}
set[c] = !set[c]; /* except for this one */
for (i = 1, s = p; i < 256; i++) {
if (set[i]) *s++ = i;
}
*s = '\0';
return(1);
} /* end DO_NOT_ATOM */
/************************************************************************/
/************************************************************************/
/* do_like_meta() Process EXPRESS LIKE meta character */
int do_like_meta(meta, p, complement)
int meta; /* the meta character */
char *p;
int complement; /* = 1 if complement the set */
{
static unsigned char set[256];
int i;
char *s = p;
/* ensure all character are noted in the set */
for (i = 0; i < 256; i++) set[i] = 1;
switch (meta) {
case '@' : { /* a single alphabetic character */
for (i = 0; i < 256; i++) set[i] = 0;
for (i = 'a'; i <= 'z'; i++) set[i] = 1;
for (i = 'A'; i <= 'Z'; i++) set[i] = 1;
break;
}
case '^' : { /* a single upper case alphabetic character */
for (i = 0; i < 256; i++) set[i] = 0;
for (i = 'A'; i <= 'Z'; i++) set[i] = 1;
break;
}
case '?' : { /* a single character */
break;
}
case '&' : { /* remainder of string */
break;
}
case '#' : { /* a single digit */
for (i = 0; i < 256; i++) set[i] = 0;
for (i = '0'; i <= '9'; i++) set[i] = 1;
break;
}
case '$' : { /* substring ended by space or EOS */
i = ' ';
set[i] = !set[i];
break;
}
case '*' : { /* any character */
break;
}
default : { /* an ERROR */
return(-1);
}
} /* end switch */
if (complement == 1) { /* invert the set */
for (i = 0; i < 256; i++) set[i] = !set[i];
}
/* add all the search characters to the string */
for (i = 1, s = p; i < 256; i++) {
if (set[i]) *s++ = i;
}
*s = '\0';
return(1);
} /* end DO_LIKE_META */
/************************************************************************/