/*      $OpenBSD: util.c,v 1.36 2007/10/02 17:59:18 otto Exp $  */
/*      $FreeBSD: head/usr.bin/grep/fastgrep.c 211496 2010-08-19 09:28:59Z des $ */

/*-
* Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
* Copyright (C) 2008 Gabor Kovesdan <[email protected]>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
*    notice, this list of conditions and the following disclaimer in the
*    documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/

/*
* XXX: This file is a speed up for grep to cover the defects of the
* regex library.  These optimizations should practically be implemented
* there keeping this code clean.  This is a future TODO, but for the
* meantime, we need to use this workaround.
*/

#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif

#include <sys/cdefs.h>
__RCSID("$NetBSD: fastgrep.c,v 1.6 2024/10/01 14:56:42 christos Exp $");

#include <limits.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>

#include "grep.h"

static inline int       grep_cmp(const unsigned char *, const unsigned char *, size_t);
static inline void      grep_revstr(unsigned char *, int);

static void
alloc_pattern(fastgrep_t *fg, const char *pat)
{
       unsigned int i;

       fg->pattern = (unsigned char *)grep_strdup(pat);
       if (!iflag)
               return;
       for (i = 0;  fg->pattern[i]; i++)
               fg->pattern[i] = towupper((unsigned char)fg->pattern[i]);
}

static void
map_pattern(fastgrep_t *fg, int hasDot)
{
       unsigned int i;

       for (i = 0; i <= UCHAR_MAX; i++)
               fg->qsBc[i] = fg->len - hasDot;
       for (i = hasDot + 1; i < fg->len; i++) {
               unsigned char ch = fg->pattern[i];
               if (iflag) {
                       fg->qsBc[ch] = fg->len - i;
                       ch = towlower(ch);
               }
               fg->qsBc[ch] = fg->len - i;
       }
}

void
fgrepcomp(fastgrep_t *fg, const char *pat)
{
       /* Initialize. */
       fg->len = strlen(pat);
       fg->bol = false;
       fg->eol = false;
       fg->reversed = false;

       alloc_pattern(fg, pat);

       /* Preprocess pattern. */
       map_pattern(fg, 0);
}

/*
* Returns: -1 on failure, 0 on success
*/
int
fastcomp(fastgrep_t *fg, const char *pat)
{
       unsigned int i;
       int firstHalfDot = -1;
       int firstLastHalfDot = -1;
       int hasDot = 0;
       int lastHalfDot = 0;

       /* Initialize. */
       fg->len = strlen(pat);
       fg->bol = false;
       fg->eol = false;
       fg->reversed = false;
       fg->word = wflag;

       /* Remove end-of-line character ('$'). */
       if (fg->len > 0 && pat[fg->len - 1] == '$') {
               fg->eol = true;
               fg->len--;
       }

       /* Remove beginning-of-line character ('^'). */
       if (pat[0] == '^') {
               fg->bol = true;
               fg->len--;
               pat++;
       }

       if (fg->len >= 14 &&
           memcmp(pat, "[[:<:]]", 7) == 0 &&
           memcmp(pat + fg->len - 7, "[[:>:]]", 7) == 0) {
               fg->len -= 14;
               pat += 7;
               /* Word boundary is handled separately in util.c */
               fg->word = true;
       }

       /*
        * pat has been adjusted earlier to not include '^', '$' or
        * the word match character classes at the beginning and ending
        * of the string respectively.
        */
       alloc_pattern(fg, pat);

       /* Look for ways to cheat...er...avoid the full regex engine. */
       for (i = 0; i < fg->len; i++) {
               /* Can still cheat? */
               if (fg->pattern[i] == '.') {
                       hasDot = i;
                       if (i < fg->len / 2) {
                               if (firstHalfDot < 0)
                                       /* Closest dot to the beginning */
                                       firstHalfDot = i;
                       } else {
                               /* Closest dot to the end of the pattern. */
                               lastHalfDot = i;
                               if (firstLastHalfDot < 0)
                                       firstLastHalfDot = i;
                       }
               } else {
                       /* Free memory and let others know this is empty. */
                       free(fg->pattern);
                       fg->pattern = NULL;
                       return (-1);
               }
       }

       /*
        * Determine if a reverse search would be faster based on the placement
        * of the dots.
        */
       if ((!(lflag || cflag)) && ((!(fg->bol || fg->eol)) &&
           ((lastHalfDot) && ((firstHalfDot < 0) ||
           ((fg->len - (lastHalfDot + 1)) < (size_t)firstHalfDot)))) &&
           !oflag && !color) {
               fg->reversed = true;
               hasDot = fg->len - (firstHalfDot < 0 ?
                   firstLastHalfDot : firstHalfDot) - 1;
               grep_revstr(fg->pattern, fg->len);
       }

       /*
        * Normal Quick Search would require a shift based on the position the
        * next character after the comparison is within the pattern.  With
        * wildcards, the position of the last dot effects the maximum shift
        * distance.
        * The closer to the end the wild card is the slower the search.  A
        * reverse version of this algorithm would be useful for wildcards near
        * the end of the string.
        *
        * Examples:
        * Pattern      Max shift
        * -------      ---------
        * this         5
        * .his         4
        * t.is         3
        * th.s         2
        * thi.         1
        */

       /* Preprocess pattern. */
       map_pattern(fg, hasDot);

       /*
        * Put pattern back to normal after pre-processing to allow for easy
        * comparisons later.
        */
       if (fg->reversed)
               grep_revstr(fg->pattern, fg->len);

       return (0);
}

int
grep_search(fastgrep_t *fg, const unsigned char *data, size_t len, regmatch_t *pmatch)
{
       unsigned int j;
       int ret = REG_NOMATCH;

       if (pmatch->rm_so == (ssize_t)len)
               return (ret);

       if (fg->bol && pmatch->rm_so != 0) {
               pmatch->rm_so = len;
               pmatch->rm_eo = len;
               return (ret);
       }

       /* No point in going farther if we do not have enough data. */
       if (len < fg->len)
               return (ret);

       /* Only try once at the beginning or ending of the line. */
       if (fg->bol || fg->eol) {
               /* Simple text comparison. */
               /* Verify data is >= pattern length before searching on it. */
               if (len >= fg->len) {
                       /* Determine where in data to start search at. */
                       j = fg->eol ? len - fg->len : 0;
                       if (!((fg->bol && fg->eol) && (len != fg->len)))
                               if (grep_cmp(fg->pattern, data + j,
                                   fg->len) == -1) {
                                       pmatch->rm_so = j;
                                       pmatch->rm_eo = j + fg->len;
                                               ret = 0;
                               }
               }
       } else if (fg->reversed) {
               /* Quick Search algorithm. */
               j = len;
               do {
                       if (grep_cmp(fg->pattern, data + j - fg->len,
                               fg->len) == -1) {
                               pmatch->rm_so = j - fg->len;
                               pmatch->rm_eo = j;
                               ret = 0;
                               break;
                       }
                       /* Shift if within bounds, otherwise, we are done. */
                       if (j == fg->len)
                               break;
                       j -= fg->qsBc[data[j - fg->len - 1]];
               } while (j >= fg->len);
       } else {
               /* Quick Search algorithm. */
               j = pmatch->rm_so;
               do {
                       if (grep_cmp(fg->pattern, data + j, fg->len) == -1) {
                               pmatch->rm_so = j;
                               pmatch->rm_eo = j + fg->len;
                               ret = 0;
                               break;
                       }

                       /* Shift if within bounds, otherwise, we are done. */
                       if (j + fg->len == len)
                               break;
                       else
                               j += fg->qsBc[data[j + fg->len]];
               } while (j <= (len - fg->len));
       }

       return (ret);
}

/*
* Returns:     i >= 0 on failure (position that it failed)
*              -1 on success
*/
static inline int
grep_cmp(const unsigned char *pat, const unsigned char *data, size_t len)
{
       size_t size;
       wchar_t *wdata, *wpat;
       unsigned int i;

       if (iflag) {
               if ((size = mbstowcs(NULL, (const char *)data, 0)) ==
                   ((size_t) - 1))
                       return (-1);

               wdata = grep_malloc(size * sizeof(wint_t));

               if (mbstowcs(wdata, (const char *)data, size) ==
                   ((size_t) - 1))
                       return (-1);

               if ((size = mbstowcs(NULL, (const char *)pat, 0)) ==
                   ((size_t) - 1))
                       return (-1);

               wpat = grep_malloc(size * sizeof(wint_t));

               if (mbstowcs(wpat, (const char *)pat, size) == ((size_t) - 1))
                       return (-1);
               for (i = 0; i < len; i++) {
                       if ((towlower(wpat[i]) == towlower(wdata[i])) ||
                           ((grepbehave != GREP_FIXED) && wpat[i] == L'.'))
                               continue;
                       free(wpat);
                       free(wdata);
                       return (i);
               }
       } else {
               for (i = 0; i < len; i++) {
                       if ((pat[i] == data[i]) || ((grepbehave != GREP_FIXED) &&
                           pat[i] == '.'))
                               continue;
                       return (i);
               }
       }
       return (-1);
}

static inline void
grep_revstr(unsigned char *str, int len)
{
       int i;
       char c;

       for (i = 0; i < len / 2; i++) {
               c = str[i];
               str[i] = str[len - i - 1];
               str[len - i - 1] = c;
       }
}