/* l2xirexp.c builds recognizers for two kinds of regular expressions */

/* l2xirexp.c builds recognizers for two kinds of regular expressions */
/* Based on rexpr.c, part of the PCCTS compiler/generator distribution */
/*
* This file contains code for
*
* int rexpr(char *expr, char *s);
*
* which answers
*
* 1 if 's' is in the language described by the regular expression 'expr'
* 0 if it is not
* -1 if the regular expression is invalid
*
* Language membership is determined by constructing a non-deterministic
* finite automata (NFA) from the regular expression. A depth-
* first-search is performed on the NFA (graph) to check for a match of 's'.
* Each non-epsilon arc consumes one character from 's'. Backtracking is
* performed to check all possible paths through the NFA.
*
* Regular expressions follow the meta-language:
*
* <regExpr> ::= <andExpr> ( '|' <andExpr> )*
*
* <andExpr> ::= <expr> ( <expr> )*
*
* <expr> ::= {'~'} '[' <atomList> ']' <repeatSymbol>
* | '(' <regExpr> ')' <repeatSymbol>
* | '{' <regExpr> '}' <repeatSymbol>
* | <atom> <repeatSymbol>
*
* <repeatSymbol> ::= { '*' | '+' }
*
* <atomList> ::= <atom> ( <atom> )*
* | { <atomList> } <atom> '-' <atom> { <atomList> }
*
* <atom> ::= Token[Atom]
*
* Notes:
* ~ means complement the set in [..]. i.e. all characters
* not listed
* * means match 0 or more times (can be on expression
* or atom)
* + means match 1 or more times (can be on expression
* or atom)
* {} optional
* () grouping
* [] set of atoms
* x-y all characters from x to y (found only in [..])
* \xx the character with value xx
*
* Examples:
* [a-z]+
* match 1 or more lower-case letters (e.g. variable)
*
* 0x[0-9A-Fa-f]+
* match a hex number with 0x on front (e.g. 0xA1FF)
*
* [0-9]+.[0-9]+{e[0-9]+}
* match a floating point number (e.g. 3.14e21)
*
* Code example:
* if ( rexpr("[a-zA-Z][a-zA-Z0-9]+", str) ) then str is keyword
*
* Terence Parr
* Purdue University
* April 1991
*/
/**********************************************************************/
/* some modifications by Peter Wilson, Catholic University of America */
/*
* Complement changed from ~[...] to [^...] to match AWK/lex
* Optional changed from {...} to (r)? to match AWK/lex
* Swapped parameters in rexpr()
*
* Major additions/changes to support the EXPRESS LIKE operator:
* string LIKE pattern -> TRUE or FALSE
*
* @ matches any letter
* ^ matches any uppercase letter
* ? matches any character
* & matches remainder of string
* # matches any digit
* $ matches any substring terminated by space or end of string
* * matches any number of characters
* \ escape character
*
*/
/**********************************************************************/

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include "l2xirexp.h"
#include "l2xiidbg.h"

static int regExpr();
static int andExpr();
static int expr();
static int repeatSymbol();
static int atomList();
static int next();
static ArcPtr newGraphArc();
static NodePtr newNode();
static int ArcBetweenGraphNode();
static Graph BuildNFA_atom();
static Graph BuildNFA_AB();
static Graph BuildNFA_AorB();
static Graph BuildNFA_set();
static Graph BuildNFA_Astar();
static Graph BuildNFA_Aplus();
static Graph BuildNFA_Aoptional();

static char *_c;
static int rx_token, tokchar;
static NodePtr accept;
static NodePtr freelist = NULL;

int like_op = 0; /* PRW = 1 iff processing EXPRESS LIKE pattern */

/*
* return 1 if s in language described by expr
* 0 if s is not
* -1 if expr is an invalid regular expression
*/
rexpr(s, expr)
char *expr, *s;
{
NodePtr p,q;
Graph nfa;
int result;

entry_debug("rexpr (l2xirexp.c)");
sprintf(dbuffer, "rexpr(%s, %s);\n", s, expr);
debug_print(dbuffer);
like_op = 0;
freelist = NULL;
_c = expr;
next();
if ( regExpr(&nfa) == -1 ) return -1;
accept = nfa.right;
result = match(nfa.left, s);
/* free all your memory */
p = q = freelist;
while ( p!=NULL ) { q = p->track; free(p); p = q; }
exit_debug("rexpr");
return result;
}

/*
* do a depth-first-search on the NFA looking for a path from start to
* accept state labelled with the characters of 's'.
*/
match(automaton, s)
NodePtr automaton;
char *s;
{
ArcPtr p;

if ( automaton == accept && *s == '\0' ) return 1; /* match */

for (p=automaton->arcs; p!=NULL; p=p->next) /* try all arcs */
{
if ( p->label == Epsilon )
{
if ( match(p->target, s) ) return 1;
}
else if ( p->label == *s )
if ( match(p->target, s+1) ) return 1;
}
return 0;
}

/*
* <regExpr> ::= <andExpr> ( '|' {<andExpr>} )*
*
* Return -1 if syntax error
* Return 0 if none found
* Return 1 if a regExrp was found
*/
static
regExpr(g)
GraphPtr g;
{
Graph g1, g2;

if ( andExpr(&g1) == -1 )
{
return -1;
}

while ( rx_token == '|' )
{
int a;
next();
a = andExpr(&g2);
if ( a == -1 ) return -1; /* syntax error below */
else if ( !a ) return 1; /* empty alternative */
g1 = BuildNFA_AorB(g1, g2);
}

if ( rx_token!='\0' ) return -1;

*g = g1;
return 1;
}

/*
* <andExpr> ::= <expr> ( <expr> )*
*/
static
andExpr(g)
GraphPtr g;
{
Graph g1, g2;

if ( expr(&g1) == -1 )
{
return -1;
}

/* PRW while ( rx_token==Atom || rx_token=='{' || rx_token=='(' || rx_token=='~' || rx_token=='[' ) */
while ( rx_token==Atom || rx_token=='(' || rx_token=='[' )
{
if (expr(&g2) == -1) return -1;
g1 = BuildNFA_AB(g1, g2);
}

*g = g1;
return 1;
}

/*
* <expr> ::= {'~'} '[' <atomList> ']' <repeatSymbol>
* | '(' <regExpr> ')' <repeatSymbol>
* | '{' <regExpr> '}' <repeatSymbol>
* | <atom> <repeatSymbol>
* PRW changed above to:
* <expr> ::= '[' {'^'} <atomList> ']' <repeatSymbol>
* | '(' <regExpr> ')' <repeatSymbol>
* | <atom> <repeatSymbol>
*/
static
expr(g)
GraphPtr g;
{
int complement = 0;
char s[257]; /* alloc space for string of char in [] */

/* PRW if ( rx_token == '~' || rx_token == '[' ) */
if ( rx_token == '[' )
{
next();
if (rx_token == '^') { complement = 1; next();}
/* PRW if ( rx_token == '~' ) {complement = 1; next();}
* if ( rx_token != '[' ) return -1;
* next();
*/
if ( atomList( s, complement ) == -1 ) return -1;
*g = BuildNFA_set( s );
if ( rx_token != ']' ) return -1;
next();
repeatSymbol( g );
return 1;
}
if ( rx_token == '(' )
{
next();
if ( regExpr( g ) == -1 ) return -1;
if ( rx_token != ')' ) return -1;
next();
repeatSymbol( g );
return 1;
}
/* PRW if ( rx_token == '{' )
* {
* next();
* if ( regExpr( g ) == -1 ) return -1;
* if ( rx_token != '}' ) return -1;
* next();
* -- S p e c i a l C a s e O p t i o n a l { } --
* if ( rx_token != '*' && rx_token != '+' )
* {
* *g = BuildNFA_Aoptional( *g );
* }
* repeatSymbol( g );
* return 1;
* }
*/
if ( rx_token == Atom )
{
*g = BuildNFA_atom( tokchar );
next();
repeatSymbol( g );
return 1;
}

return -1;
}

/*
* <repeatSymbol> ::= { '*' | '+' }
* PRW changed to ::= { '*' | '+' | '?' }
*/
static
repeatSymbol(g)
GraphPtr g;
{
switch ( rx_token )
{
case '*' : *g = BuildNFA_Astar( *g ); next(); break;
case '+' : *g = BuildNFA_Aplus( *g ); next(); break;
case '?' : *g = BuildNFA_Aoptional( *g ); next(); break;
}
return 1;
}

/*
* <atomList> ::= <atom> { <atom> }*
* { <atomList> } <atom> '-' <atom> { <atomList> }
*
* a-b is same as ab
* q-a is same as q
*/
static
atomList(p, complement)
char *p;
int complement;
{
static unsigned char set[256]; /* no duplicates */
int first, last, i;
char *s = p;

if ( rx_token != Atom ) return -1;

for (i=0; i<256; i++) set[i] = 0;
while ( rx_token == Atom )
{
if ( !set[tokchar] ) *s++ = tokchar;
set[tokchar] = 1; /* Add atom to set */
next();
if ( rx_token == '-' ) /* have we found '-' */
{
first = *(s-1); /* Get last char */
next();
if ( rx_token != Atom ) return -1;
else
{
last = tokchar;
}
for (i = first+1; i <= last; i++)
{
if ( !set[tokchar] ) *s++ = i;
set[i] = 1; /* Add atom to set */
}
next();
}
}
*s = '\0';
if ( complement )
{
for (i=0; i<256; i++) set[i] = !set[i];
for (i=1,s=p; i<256; i++) if ( set[i] ) *s++ = i;
*s = '\0';
}
return 1;
}

/* a somewhat stupid lexical analyzer */
static
next()
{
/* PRW seperate scanners for regular expression and LIKE operator */
if (like_op == 0) { /* regex scanning */
while ( *_c==' ' || *_c=='\t' || *_c=='\n' ) _c++;
if ( *_c=='\\' )
{
_c++;
if ( isdigit(*_c) )
{
int n=0;
while ( isdigit(*_c) )
{
n = n*10 + (*_c++ - '0');
}
if ( n>255 ) n=255;
tokchar = n;
}
else
{
switch (*_c)
{
case 'n' : tokchar = '\n'; break;
case 't' : tokchar = '\t'; break;
case 'r' : tokchar = '\r'; break;
default : tokchar = *_c;
}
_c++;
}
rx_token = Atom;
}
else if ( isgraph(*_c) && *_c!='['
&& *_c!='('
&& *_c!='-'
&& *_c!=')'
&& *_c!=']'
&& *_c!='?'
&& *_c!='+'
&& *_c!='*'
&& *_c!='^'
&& *_c!='|' )
{ /* PRW deleted {, } and ~ while adding ^ and ? */
rx_token = Atom;
tokchar = *_c++;
}
else
{
rx_token = tokchar = *_c++;
}
return;
} /* end of regex scanner */
else if (like_op == 1) { /* LIKE scanning */
while ( *_c==' ' || *_c=='\t' || *_c=='\n' ) _c++;
if ( *_c=='\\' )
{
_c++;
switch (*_c)
{
case 'n' : tokchar = '\n'; break;
case 't' : tokchar = '\t'; break;
case 'r' : tokchar = '\r'; break;
default : tokchar = *_c;
}
_c++;
rx_token = Atom;
}
else if ( isgraph(*_c) && *_c!='@'
&& *_c!='^'
&& *_c!='?'
&& *_c!='&'
&& *_c!='#'
&& *_c!='$'
&& *_c!='*'
&& *_c!='!' )
{
rx_token = Atom;
tokchar = *_c++;
}
else
{
rx_token = tokchar = *_c++;
}
return;
} /* end of LIKE scanner */
}

/* N F A B u i l d i n g R o u t i n e s */

static
ArcPtr
newGraphArc()
{
ArcPtr p;
p = (ArcPtr) calloc(1, sizeof(Arc));
if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}
if ( freelist != NULL ) p->track = (ArcPtr) freelist;
freelist = (NodePtr) p;
return p;
}

static
NodePtr
newNode()
{
NodePtr p;
p = (NodePtr) calloc(1, sizeof(Node));
if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}
if ( freelist != NULL ) p->track = freelist;
freelist = p;
return p;
}

static
ArcBetweenGraphNodes(i, j, label)
NodePtr i, j;
int label;
{
ArcPtr a;

a = newGraphArc();
if ( i->arcs == NULL ) i->arctail = i->arcs = a;
else {(i->arctail)->next = a; i->arctail = a;}
a->label = label;
a->target = j;
}

static Graph
BuildNFA_atom(label)
int label;
{
Graph g;

g.left = newNode();
g.right = newNode();
ArcBetweenGraphNodes(g.left, g.right, label);
return( g );
}

static Graph
BuildNFA_AB(A, B)
Graph A, B;
{
Graph g;

ArcBetweenGraphNodes(A.right, B.left, Epsilon);
g.left = A.left;
g.right = B.right;
return( g );
}

static Graph
BuildNFA_AorB(A, B)
Graph A, B;
{
Graph g;

g.left = newNode();
ArcBetweenGraphNodes(g.left, A.left, Epsilon);
ArcBetweenGraphNodes(g.left, B.left, Epsilon);
g.right = newNode();
ArcBetweenGraphNodes(A.right, g.right, Epsilon);
ArcBetweenGraphNodes(B.right, g.right, Epsilon);
return( g );
}

static Graph
BuildNFA_set( s )
char *s;
{
Graph g;

if ( s == NULL ) return g;

g.left = newNode();
g.right = newNode();
while ( *s != '\0' )
{
ArcBetweenGraphNodes(g.left, g.right, *s++);
}
return g;
}

static Graph
BuildNFA_Astar( A )
Graph A;
{
Graph g;

g.left = newNode();
g.right = newNode();

ArcBetweenGraphNodes(g.left, A.left, Epsilon);
ArcBetweenGraphNodes(g.left, g.right, Epsilon);
ArcBetweenGraphNodes(A.right, g.right, Epsilon);
ArcBetweenGraphNodes(A.right, A.left, Epsilon);

return( g );
}

static Graph
BuildNFA_Aplus( A )
Graph A;
{
ArcBetweenGraphNodes(A.right, A.left, Epsilon);

return( A );
}

static Graph
BuildNFA_Aoptional( A )
Graph A;
{
Graph g;

g.left = newNode();
g.right = newNode();

ArcBetweenGraphNodes(g.left, A.left, Epsilon);
ArcBetweenGraphNodes(g.left, g.right, Epsilon);
ArcBetweenGraphNodes(A.right, g.right, Epsilon);

return( g );
}

/* LIKE operator code */

/************************************************************************/
/* likeExpr() Process likeExpr */

static likeExpr(g)
GraphPtr g;
{
int complement = 0;
char s[257];
int i;
Graph g2;

if (rx_token == '\\') { /* escape char */
next();
}
if (rx_token == '!') { /* NOT char */
complement = 1;
next();
}

if (rx_token == Atom) {
if (complement == 1) {
do_not_atom(s, tokchar);
*g = BuildNFA_set(s);
}
else {
*g = BuildNFA_atom(tokchar);
}
next();
return(1);
}
else if ( rx_token == '@'
|| rx_token == '^'
|| rx_token == '?'
|| rx_token == '&'
|| rx_token == '#'
|| rx_token == '$'
|| rx_token == '*' ) {
do_like_meta(rx_token, s, complement);
*g = BuildNFA_set(s);
if ( rx_token == '&'
|| rx_token == '*'
|| rx_token == '$' ) { /* multiple chars */
*g = BuildNFA_Astar(*g);
if (rx_token == '$') { /* add 0 or more following space chars */
g2 = BuildNFA_atom(' ');
g2 = BuildNFA_Astar(g2);
*g = BuildNFA_AB(*g, g2);
}
}
next();
return(1);
}

return(-1); /* should not be here */
} /* end LIKEEXPR */
/************************************************************************/

/************************************************************************/
/* likePat() Process likePat ::= <likeExpr> ( <likeExpr> )* */

static likePat(g)
GraphPtr g;
{
Graph g1, g2;

if (likeExpr(&g1) == -1) {
return(-1);
}

while (rx_token != '\0') {
if (likeExpr(&g2) == -1) {
return(-1);
}
g1 = BuildNFA_AB(g1, g2);
}
*g = g1;
return(1);
} /* end LIKEPAT */
/************************************************************************/

/************************************************************************/
/* like_expr(expr,s) */
/* returns -1 if expr not an EXPRESS LIKE pattern */
/* 0 if s does not match pattern expr */
/* 1 if s matches pattern expr */

int like_expr(s, expr)
char *s; /* string to be matched */
char *expr; /* the pattern */
{
NodePtr p, q;
Graph nfa;
int result;

entry_debug("like_expr (l2xirexp.c)");
sprintf(dbuffer, "like_expr(%s, %s);\n", s, expr);
debug_print(dbuffer);

like_op = 1; /* set correct scanner */
freelist = NULL;
_c = expr;
next();
if (likePat(&nfa) == -1) {
return(-1);
}
accept = nfa.right;
result = match(nfa.left, s);
/* free memory */
p = q = freelist;
while (p != NULL) {
q = p->track;
free(p);
p = q;
}
exit_debug("like_expr");
return(result);
} /* end LIKE_EXPR */
/************************************************************************/

/************************************************************************/
/* do_not_atom() Process !a */

int do_not_atom(p, c)
char *p;
int c; /* the negated atom char */
{
static unsigned char set [256];
int i;
char *s = p;

for (i = 0; i < 256; i++) { /* all characters */
set[i] = 1;
}
set[c] = !set[c]; /* except for this one */

for (i = 1, s = p; i < 256; i++) {
if (set[i]) *s++ = i;
}
*s = '\0';
return(1);
} /* end DO_NOT_ATOM */
/************************************************************************/

/************************************************************************/
/* do_like_meta() Process EXPRESS LIKE meta character */

int do_like_meta(meta, p, complement)
int meta; /* the meta character */
char *p;
int complement; /* = 1 if complement the set */
{
static unsigned char set[256];
int i;
char *s = p;

/* ensure all character are noted in the set */
for (i = 0; i < 256; i++) set[i] = 1;

switch (meta) {
case '@' : { /* a single alphabetic character */
for (i = 0; i < 256; i++) set[i] = 0;
for (i = 'a'; i <= 'z'; i++) set[i] = 1;
for (i = 'A'; i <= 'Z'; i++) set[i] = 1;
break;
}
case '^' : { /* a single upper case alphabetic character */
for (i = 0; i < 256; i++) set[i] = 0;
for (i = 'A'; i <= 'Z'; i++) set[i] = 1;
break;
}
case '?' : { /* a single character */
break;
}
case '&' : { /* remainder of string */
break;
}
case '#' : { /* a single digit */
for (i = 0; i < 256; i++) set[i] = 0;
for (i = '0'; i <= '9'; i++) set[i] = 1;
break;
}
case '$' : { /* substring ended by space or EOS */
i = ' ';
set[i] = !set[i];
break;
}
case '*' : { /* any character */
break;
}
default : { /* an ERROR */
return(-1);
}
} /* end switch */

if (complement == 1) { /* invert the set */
for (i = 0; i < 256; i++) set[i] = !set[i];
}

/* add all the search characters to the string */
for (i = 1, s = p; i < 256; i++) {
if (set[i]) *s++ = i;
}
*s = '\0';
return(1);
} /* end DO_LIKE_META */
/************************************************************************/