/*      $NetBSD: regexp.c,v 1.16 2024/12/08 07:53:18 andvar Exp $       */

/*
* Copyright (c) 1980, 1993
*      The Regents of the University of California.  All rights reserved.
*
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
*    notice, this list of conditions and the following disclaimer in the
*    documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
*    may be used to endorse or promote products derived from this software
*    without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/

#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif

#include <sys/cdefs.h>
#ifndef lint
__COPYRIGHT("@(#) Copyright (c) 1980, 1993\
The Regents of the University of California.  All rights reserved.");
#endif /* not lint */

#ifndef lint
#if 0
static char sccsid[] = "@(#)regexp.c    8.1 (Berkeley) 6/6/93";
#endif
__RCSID("$NetBSD: regexp.c,v 1.16 2024/12/08 07:53:18 andvar Exp $");
#endif /* not lint */

#include <assert.h>
#include <ctype.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include "extern.h"

static void     expconv (void);

bool     x_escaped;     /* true if we are currently x_escaped */
char    *x_start;       /* start of string */
bool     l_onecase;     /* true if upper and lower equivalent */

#define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))

static char
tocc(ptrdiff_t x) {
       if (x & ~0xff)
               abort();
       return (char)x;
}

/*  STRNCMP -   like strncmp except that we convert the
*              first string to lower case before comparing
*              if l_onecase is set.
*/

int
STRNCMP(char *s1, char *s2, int len)
{
       if (l_onecase) {
           do
               if (*s2 - makelower(*s1))
                       return *s2 - makelower(*s1);
               else {
                       s2++;
                       s1++;
               }
           while (--len);
       } else {
           do
               if (*s2 - *s1)
                       return *s2 - *s1;
               else {
                       s2++;
                       s1++;
               }
           while (--len);
       }
       return(0);
}

/*      The following routine converts an irregular expression to
*      internal format.
*
*      Either meta symbols (\a \d or \p) or character strings or
*      operations ( alternation or perenthesizing ) can be
*      specified.  Each starts with a descriptor byte.  The descriptor
*      byte has STR set for strings, META set for meta symbols
*      and OPER set for operations.
*      The descriptor byte can also have the OPT bit set if the object
*      defined is optional.  Also ALT can be set to indicate an alternation.
*
*      For metasymbols the byte following the descriptor byte identities
*      the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
*      strings the byte after the descriptor is a character count for
*      the string:
*
*              meta symbols := descriptor
*                              symbol
*
*              strings :=      descriptor
*                              character count
*                              the string
*
*              operations :=   descriptor
*                              symbol
*                              character count
*/

/*
*  handy macros for accessing parts of match blocks
*/
#define MSYM(A) (*(A+1))        /* symbol in a meta symbol block */
#define MNEXT(A) (A+2)          /* character following a metasymbol block */

#define OSYM(A) (*(A+1))        /* symbol in an operation block */
#define OCNT(A) (*(A+2))        /* character count */
#define ONEXT(A) (A+3)          /* next character after the operation */
#define OPTR(A) (A+*(A+2))      /* place pointed to by the operator */

#define SCNT(A) (*(A+1))        /* byte count of a string */
#define SSTR(A) (A+2)           /* address of the string */
#define SNEXT(A) (A+2+*(A+1))   /* character following the string */

/*
*  bit flags in the descriptor
*/
#define OPT 1
#define STR 2
#define META 4
#define ALT 8
#define OPER 16

static char *ccre;      /* pointer to current position in converted exp*/
static char *ure;       /* pointer current position in unconverted exp */

char *
convexp(char *re)       /* unconverted irregular expression */
{
   char *cre;          /* pointer to converted regular expression */

   /* allocate room for the converted expression */
   if (re == NULL)
       return NULL;
   if (*re == '\0')
       return NULL;
   cre = malloc(4 * strlen(re) + 3);
   ccre = cre;
   ure = re;

   /* start the conversion with a \a */
   *cre = META | OPT;
   MSYM(cre) = 'a';
   ccre = MNEXT(cre);

   /* start the conversion (its recursive) */
   expconv();
   *ccre = 0;
   return cre;
}

static void
expconv(void)
{
   char *cs;           /* pointer to current symbol in converted exp */
   char c;             /* character being processed */
   char *acs;          /* pinter to last alternate */
   int temp;

   /* let the conversion begin */
   acs = NULL;
   cs = NULL;
   while (*ure) {
       switch (c = *ure++) {

       case '\\':
           switch (c = *ure++) {

           /* escaped characters are just characters */
           default:
               if (cs == NULL || (*cs & STR) == 0) {
                   cs = ccre;
                   *cs = STR;
                   SCNT(cs) = 1;
                   ccre += 2;
               } else
                   SCNT(cs)++;
               *ccre++ = c;
               break;

           /* normal(?) metacharacters */
           case 'a':
           case 'd':
           case 'e':
           case 'p':
               if (acs != NULL && acs != cs) {
                   do {
                       temp = OCNT(acs);
                       OCNT(acs) = tocc(ccre - acs);
                       acs -= temp;
                   } while (temp != 0);
                   acs = NULL;
               }
               cs = ccre;
               *cs = META;
               MSYM(cs) = c;
               ccre = MNEXT(cs);
               break;
           }
           break;

       /* just put the symbol in */
       case '^':
       case '$':
           if (acs != NULL && acs != cs) {
               do {
                   temp = OCNT(acs);
                   OCNT(acs) = tocc(ccre - acs);
                   acs -= temp;
               } while (temp != 0);
               acs = NULL;
           }
           cs = ccre;
           *cs = META;
           MSYM(cs) = c;
           ccre = MNEXT(cs);
           break;

       /* mark the last match sequence as optional */
       case '?':
           if (cs)
               *cs = *cs | OPT;
           break;

       /* recurse and define a subexpression */
       case '(':
           if (acs != NULL && acs != cs) {
               do {
                   temp = OCNT(acs);
                   OCNT(acs) = tocc(ccre - acs);
                   acs -= temp;
               } while (temp != 0);
               acs = NULL;
           }
           cs = ccre;
           *cs = OPER;
           OSYM(cs) = '(';
           ccre = ONEXT(cs);
           expconv();
           OCNT(cs) = tocc(ccre - cs);         /* offset to next symbol */
           break;

       /* return from a recursion */
       case ')':
           if (acs != NULL) {
               do {
                   temp = OCNT(acs);
                   OCNT(acs) = tocc(ccre - acs);
                   acs -= temp;
               } while (temp != 0);
               acs = NULL;
           }
           cs = ccre;
           *cs = META;
           MSYM(cs) = c;
           ccre = MNEXT(cs);
           return;

       /* mark the last match sequence as having an alternate */
       /* the third byte will contain an offset to jump over the */
       /* alternate match in case the first did not fail */
       case '|':
           if (acs != NULL && acs != cs)
               OCNT(ccre) = tocc(ccre - acs);  /* make a back pointer */
           else
               OCNT(ccre) = 0;
           assert(cs != NULL);
           *cs |= ALT;
           cs = ccre;
           *cs = OPER;
           OSYM(cs) = '|';
           ccre = ONEXT(cs);
           acs = cs;   /* remember that the pointer is to be filles */
           break;

       /* if its not a metasymbol just build a scharacter string */
       default:
           if (cs == NULL || (*cs & STR) == 0) {
               cs = ccre;
               *cs = STR;
               SCNT(cs) = 1;
               ccre = SSTR(cs);
           } else
               SCNT(cs)++;
           *ccre++ = c;
           break;
       }
   }
   if (acs != NULL) {
       do {
           temp = OCNT(acs);
           OCNT(acs) = tocc(ccre - acs);
           acs -= temp;
       } while (temp != 0);
       acs = NULL;
   }
   return;
}
/* end of convertre */


/*
*      The following routine recognises an irregular expression
*      with the following special characters:
*
*              \?      -       means last match was optional
*              \a      -       matches any number of characters
*              \d      -       matches any number of spaces and tabs
*              \p      -       matches any number of alphanumeric
*                              characters. The
*                              characters matched will be copied into
*                              the area pointed to by 'name'.
*              \|      -       alternation
*              \( \)   -       grouping used mostly for alternation and
*                              optionality
*
*      The irregular expression must be translated to internal form
*      prior to calling this routine
*
*      The value returned is the pointer to the first non \a
*      character matched.
*/

char *
expmatch(
   char *s,            /* string to check for a match in */
   char *re,           /* a converted irregular expression */
   char *mstring)      /* where to put whatever matches a \p */
{
   char *cs;           /* the current symbol */
   char *ptr,*s1;      /* temporary pointer */
   bool matched;       /* a temporary bool */

   /* initial conditions */
   if (re == NULL)
       return NULL;
   cs = re;
   matched = false;

   /* loop till expression string is exhausted (or at least pretty tired) */
   while (*cs) {
       switch (*cs & (OPER | STR | META)) {

       /* try to match a string */
       case STR:
           matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
           if (matched) {

               /* hoorah it matches */
               s += SCNT(cs);
               cs = SNEXT(cs);
           } else if (*cs & ALT) {

               /* alternation, skip to next expression */
               cs = SNEXT(cs);
           } else if (*cs & OPT) {

               /* the match is optional */
               cs = SNEXT(cs);
               matched = 1;            /* indicate a successful match */
           } else {

               /* no match, error return */
               return NULL;
           }
           break;

       /* an operator, do something fancy */
       case OPER:
           switch (OSYM(cs)) {

           /* this is an alternation */
           case '|':
               if (matched)

                   /* last thing in the alternation was a match, skip ahead */
                   cs = OPTR(cs);
               else

                   /* no match, keep trying */
                   cs = ONEXT(cs);
               break;

           /* this is a grouping, recurse */
           case '(':
               ptr = expmatch(s, ONEXT(cs), mstring);
               if (ptr != NULL) {

                   /* the subexpression matched */
                   matched = 1;
                   s = ptr;
               } else if (*cs & ALT) {

                   /* alternation, skip to next expression */
                   matched = 0;
               } else if (*cs & OPT) {

                   /* the match is optional */
                   matched = 1;        /* indicate a successful match */
               } else {

                   /* no match, error return */
                   return NULL;
               }
               cs = OPTR(cs);
               break;
           }
           break;

       /* try to match a metasymbol */
       case META:
           switch (MSYM(cs)) {

           /* try to match anything and remember what was matched */
           case 'p':
               /*
                *  This is really the same as trying the match the
                *  remaining parts of the expression to any subset
                *  of the string.
                */
               s1 = s;
               do {
                   ptr = expmatch(s1, MNEXT(cs), mstring);
                   if (ptr != NULL && s1 != s) {

                       /* we have a match, remember the match */
                       strncpy(mstring, s, (size_t)(s1 - s));
                       mstring[s1 - s] = '\0';
                       return ptr;
                   } else if (ptr != NULL && (*cs & OPT)) {

                       /* it was aoptional so no match is ok */
                       return ptr;
                   } else if (ptr != NULL) {

                       /* not optional and we still matched */
                       return NULL;
                   }
                   if (!isalnum((unsigned char)*s1) && *s1 != '_')
                       return NULL;
                   if (*s1 == '\\')
                       x_escaped = x_escaped ? false : true;
                   else
                       x_escaped = false;
               } while (*s1++);
               return NULL;

           /* try to match anything */
           case 'a':
               /*
                *  This is really the same as trying the match the
                *  remaining parts of the expression to any subset
                *  of the string.
                */
               s1 = s;
               do {
                   ptr = expmatch(s1, MNEXT(cs), mstring);
                   if (ptr != NULL && s1 != s) {

                       /* we have a match */
                       return ptr;
                   } else if (ptr != NULL && (*cs & OPT)) {

                       /* it was aoptional so no match is ok */
                       return ptr;
                   } else if (ptr != NULL) {

                       /* not optional and we still matched */
                       return NULL;
                   }
                   if (*s1 == '\\')
                       x_escaped = x_escaped ? false : true;
                   else
                       x_escaped = false;
               } while (*s1++);
               return NULL;

           /* fail if we are currently x_escaped */
           case 'e':
               if (x_escaped)
                   return NULL;
               cs = MNEXT(cs);
               break;

           /* match any number of tabs and spaces */
           case 'd':
               ptr = s;
               while (*s == ' ' || *s == '\t')
                   s++;
               if (s != ptr || s == x_start) {

                   /* match, be happy */
                   matched = 1;
                   cs = MNEXT(cs);
               } else if (*s == '\n' || *s == '\0') {

                   /* match, be happy */
                   matched = 1;
                   cs = MNEXT(cs);
               } else if (*cs & ALT) {

                   /* try the next part */
                   matched = 0;
                   cs = MNEXT(cs);
               } else if (*cs & OPT) {

                   /* doesn't matter */
                   matched = 1;
                   cs = MNEXT(cs);
               } else

                   /* no match, error return */
                   return NULL;
               break;

           /* check for end of line */
           case '$':
               if (*s == '\0' || *s == '\n') {

                   /* match, be happy */
                   s++;
                   matched = 1;
                   cs = MNEXT(cs);
               } else if (*cs & ALT) {

                   /* try the next part */
                   matched = 0;
                   cs = MNEXT(cs);
               } else if (*cs & OPT) {

                   /* doesn't matter */
                   matched = 1;
                   cs = MNEXT(cs);
               } else

                   /* no match, error return */
                   return NULL;
               break;

           /* check for start of line */
           case '^':
               if (s == x_start) {

                   /* match, be happy */
                   matched = 1;
                   cs = MNEXT(cs);
               } else if (*cs & ALT) {

                   /* try the next part */
                   matched = 0;
                   cs = MNEXT(cs);
               } else if (*cs & OPT) {

                   /* doesn't matter */
                   matched = 1;
                   cs = MNEXT(cs);
               } else

                   /* no match, error return */
                   return NULL;
               break;

           /* end of a subexpression, return success */
           case ')':
               return s;
           }
           break;
       }
   }
   return s;
}