/*      $NetBSD: ntp_scanner.c,v 1.15 2024/08/18 20:47:18 christos Exp $        */


/* ntp_scanner.c
*
* The source code for a simple lexical analyzer.
*
* Written By:  Sachin Kamboj
*              University of Delaware
*              Newark, DE 19711
* Copyright (c) 2006
*/

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>

#include "ntpd.h"
#include "ntp_config.h"
#include "ntpsim.h"
#include "ntp_scanner.h"
#include "ntp_parser.h"

/* ntp_keyword.h declares finite state machine and token text */
#include "ntp_keyword.h"



/* SCANNER GLOBAL VARIABLES
* ------------------------
*/

#define MAX_LEXEME      128     /* The maximum size of a lexeme (size of yytext[], terminating NUL included) */
char yytext[MAX_LEXEME];        /* Buffer for storing the input text/lexeme */
u_int32 conf_file_sum;          /* Simple sum of characters read (file input only; pushed-back characters are subtracted again) */

/* Top of the stack of nested input sources; NULL while no source is open */
static struct FILE_INFO * lex_stack = NULL;



/* CONSTANTS
* ---------
*/

/* Characters that are returned as single-character tokens of their own
* while the scanner expects a token (FOLLBY_TOKEN). Also consulted by
* quote_if_needed() to decide whether a string needs quoting.
*/
const char special_chars[] = "{}(),;|=";


/* FUNCTIONS
* ---------
*/

/* forward declaration; the definition is in the STATE MACHINES section */
static int is_keyword(char *lexeme, follby *pfollowedby);


/*
* keyword() - Return the keyword associated with token T_ identifier.
*             See also token_name() for the string-ized T_ identifier.
*             Example: keyword(T_Server) returns "server"
*                      token_name(T_Server) returns "T_Server"
*
* T_ServerresponseFuzz is special-cased to its two-word spelling. For a
* token that has no entry in keyword_text[] a diagnostic text is
* formatted into a static buffer; that result is overwritten by the
* next such call.
*/
const char *
keyword(
       int token
       )
{
       static char sbuf[64];
       size_t i;

       if (T_ServerresponseFuzz == token)
               return "serverresponse fuzz";

       /* A token below LOWEST_KEYWORD_ID converts to a very large
        * unsigned index and is rejected by the same bounds check.
        */
       i = token - LOWEST_KEYWORD_ID;
       if (i < COUNTOF(keyword_text))
               return keyword_text[i];

       /* '%u' requires an unsigned argument; the cast keeps the
        * printed text the same as before.
        */
       snprintf(sbuf, sizeof sbuf,
               "(keyword #%u not found)", (unsigned int)token);
       return sbuf;
}


/* FILE & STRING BUFFER INTERFACE
* ------------------------------
*
* This set out as a couple of wrapper functions around the standard C
* fgetc and ungetc functions in order to include positional
* bookkeeping. Alas, this is no longer a good solution with nested
* input files and the possibility to send configuration commands via
* 'ntpdc' and 'ntpq'.
*
* Now there are a few functions to maintain a stack of nested input
* sources (though nesting is only allowed for disk files) and from the
* scanner / parser point of view there's no difference between both
* types of sources.
*
* The 'fgetc()' / 'ungetc()' replacements now operate on a FILE_INFO
* structure. Instead of trying different 'ungetc()' strategies for file
* and buffer based parsing, we keep the backup char in our own
* FILE_INFO structure. This is sufficient, as the parser does *not*
* jump around via 'seek' or the like, and there's no need to
* check/clear the backup store in other places than 'lex_getch()'.
*/

/*
* Create a FILE_INFO record for the input source named by 'path'.
*
* With a non-NULL 'mode' the file is opened with fopen(path, mode); if
* that fails, the record is released again and NULL is returned. With a
* NULL 'mode' no file is opened and the record keeps a NULL file
* pointer, as suited for remote config command parsing.
*
* The source name (or, more proper, the input source description) is
* copied into the variable-sized tail of the record, so the record
* does not refer to storage that might go out of scope. 'path' must
* not be NULL.
*/
static struct FILE_INFO *
lex_open(
       const char *path,
       const char *mode
       )
{
       const size_t      namelen = strlen(path);
       struct FILE_INFO *info;

       info = emalloc_zero(sizeof(*info) + namelen);
       /* only the characters are copied -- trailing NUL already there! */
       memcpy(info->fname, path, namelen);
       info->backch = EOF;
       info->curpos.nline = 1;

       /* no mode, no file: the record is complete as it is */
       if (NULL == mode)
               return info;

       info->fpi = fopen(path, mode);
       if (NULL != info->fpi)
               return info;

       free(info);
       return NULL;
}

/* get next character from buffer or file. This will return any putback
* character first; it will also make sure the last line is at least
* virtually terminated with a '\n'.
*/
static int
lex_getch(
       struct FILE_INFO *stream
       )
{
       int ch;

       if (NULL == stream || stream->force_eof)
               return EOF;

       if (EOF != stream->backch) {
               ch = stream->backch;
               stream->backch = EOF;
               if (stream->fpi)
                       conf_file_sum += ch;
               stream->curpos.ncol++;
       } else if (stream->fpi) {
               /* fetch next 7-bit ASCII char (or EOF) from file */
               while ((ch = fgetc(stream->fpi)) != EOF && ch > SCHAR_MAX)
                       stream->curpos.ncol++;
               if (EOF != ch) {
                       conf_file_sum += ch;
                       stream->curpos.ncol++;
               }
       } else {
               /* fetch next 7-bit ASCII char from buffer */
               const char * scan;
               scan = &remote_config.buffer[remote_config.pos];
               while ((ch = (u_char)*scan) > SCHAR_MAX) {
                       scan++;
                       stream->curpos.ncol++;
               }
               if ('\0' != ch) {
                       scan++;
                       stream->curpos.ncol++;
               } else {
                       ch = EOF;
               }
               remote_config.pos = (int)(scan - remote_config.buffer);
       }

       /* If the last line ends without '\n', generate one. This
        * happens most likely on Windows, where editors often have a
        * sloppy concept of a line.
        */
       if (EOF == ch && stream->curpos.ncol != 0)
               ch = '\n';

       /* update scan position tallies */
       if (ch == '\n') {
               stream->bakpos = stream->curpos;
               stream->curpos.nline++;
               stream->curpos.ncol = 0;
       }

       return ch;
}

/* Push 'ch' back onto 'stream' so that the next lex_getch() delivers
* it again.
*
* Only one character can be held. The call fails and returns EOF for a
* NULL stream, for a stream in force_eof mode, when a character is
* already held, or when 'ch' is EOF. Otherwise the stored character is
* returned.
*
* Note: position tracking can step back over one line end only. But
* since only one char of back storage is guaranteed anyway, this
* should not be a problem.
*/
static int
lex_ungetch(
       int ch,
       struct FILE_INFO *stream
       )
{
       int pushed;

       /* refuse what cannot be stored */
       if (NULL == stream || stream->force_eof
           || EOF != stream->backch || EOF == ch)
               return EOF;

       pushed = (u_char)ch;
       stream->backch = pushed;

       /* the character is added to the sum again when it is re-read */
       if (stream->fpi)
               conf_file_sum -= pushed;

       /* step the position back, across the line end if need be */
       if ('\n' == pushed) {
               stream->curpos = stream->bakpos;
               stream->bakpos.ncol = -1;
       }
       stream->curpos.ncol--;

       return pushed;
}

/* Release an input record: close its file, if it has one, and free the
* record itself. A NULL 'stream' is ignored. The result of 'fclose()'
* is not checked.
*/
static void
lex_close(
       struct FILE_INFO *stream
       )
{
       if (NULL == stream)
               return;

       if (NULL != stream->fpi)
               fclose(stream->fpi);
       free(stream);
}

/* INPUT STACK
* -----------
*
* Nested input sources are a bit tricky at first glance. We deal with
* this problem using a stack of input sources, that is, a forward
* linked list of FILE_INFO structs.
*
* This stack is never empty during parsing; while an encounter with EOF
* can and will remove nested input sources, removing the last element
* in the stack will not work during parsing, and the EOF condition of
* the outermost input file remains until the parser folds up.
*/

/* Close and free every record of the list that starts at 'head'.
* Always returns NULL, which the callers store as the new list link.
*/
static struct FILE_INFO *
drop_stack_do(
       struct FILE_INFO * head
       )
{
       struct FILE_INFO * next;

       for ( ; NULL != head; head = next) {
               /* fetch the link before the record goes away */
               next = head->st_next;
               lex_close(head);
       }
       return NULL;
}



/* Set up the first (and only) input source on an empty lexer stack.
*
* Nothing happens and FALSE is returned if 'path' is NULL or if the
* stack already holds an input source. Otherwise the source is created
* by lex_open(), which can fail when the disk file cannot be opened.
*
* Returns TRUE if a new input object was successfully created.
*/
int/*BOOL*/
lex_init_stack(
       const char * path,
       const char * mode
       )
{
       struct FILE_INFO * top;

       if (NULL == path || NULL != lex_stack)
               return FALSE;

       top = lex_open(path, mode);
       lex_stack = top;
       return (top != NULL);
}

/* This removes *all* input sources from the stack, closing and freeing
* each of them, and leaves the head pointer as NULL. Any attempt to
* parse in that state is likely to bomb with segmentation faults or the
* like.
*
* In other words: Use this to clean up after parsing, and do not parse
* anything until the next 'lex_init_stack()' succeeded.
*/
void
lex_drop_stack(void)
{
       /* drop_stack_do() hands back the (now empty) list head */
       lex_stack = drop_stack_do(lex_stack);
}

/* Flush the lexer input stack: This will nip all input objects on the
* stack (but keeps the current top-of-stack) and marks the top-of-stack
* as inactive. Any further calls to lex_getch yield only EOF, and it's
* no longer possible to push something back.
*
* Returns TRUE if there is a head element (top-of-stack) that was not
* in the force-eof mode before this call.
*/
int/*BOOL*/
lex_flush_stack(void)
{
       int retv = FALSE;

       if (NULL != lex_stack) {
               retv = !lex_stack->force_eof;
               lex_stack->force_eof = TRUE;
               lex_stack->st_next = drop_stack_do(
                                       lex_stack->st_next);
       }
       return retv;
}

/* Push another file on the parsing stack. If the mode is NULL, create a
* FILE_INFO suitable for in-memory parsing; otherwise, create a
* FILE_INFO that is bound to a local/disc file. Note that 'path' must
* not be NULL, or the function will fail.
*
* Returns TRUE if a new info record was pushed onto the stack.
*/
int/*BOOL*/ lex_push_file(
       const char * path,
       const char * mode
       )
{
       struct FILE_INFO * next = NULL;

       if (NULL != path) {
               next = lex_open(path, mode);
               if (NULL != next) {
                       next->st_next = lex_stack;
                       lex_stack = next;
               }
       }
       return (NULL != next);
}

/* Pop, close & free the top of the include stack, unless the stack
* contains only a singleton input object. In that case the function
* fails, because the parser does not expect the input stack to be
* empty.
*
* Returns TRUE if an object was successfuly popped from the stack.
*/
int/*BOOL*/
lex_pop_file(void)
{
       struct FILE_INFO * head = lex_stack;
       struct FILE_INFO * tail = NULL;

       if (NULL != head) {
               tail = head->st_next;
               if (NULL != tail) {
                       lex_stack = tail;
                       lex_close(head);
               }
       }
       return (NULL != tail);
}

/* Get include nesting level. This currently loops over the stack and
* counts elements; but since this is of concern only with an include
* statement and the nesting depth has a small limit, there's no
* bottleneck expected here.
*
* Returns the nesting level of includes, that is, the current depth of
* the lexer input stack.
*
* Note:
*/
size_t
lex_level(void)
{
       size_t            cnt = 0;
       struct FILE_INFO *ipf = lex_stack;

       while (NULL != ipf) {
               cnt++;
               ipf = ipf->st_next;
       }
       return cnt;
}

/* check if the current input is from a file */
int/*BOOL*/
lex_from_file(void)
{
       return (NULL != lex_stack) && (NULL != lex_stack->fpi);
}

/* Return the input source on top of the lexer stack, or NULL when the
* stack is empty.
*/
struct FILE_INFO *
lex_current(void)
{
       /* this became so simple, it could be a macro. But then,
        * lex_stack needed to be global...
        */
       return lex_stack;
}


/* STATE MACHINES
* --------------
*/

/* Keywords
*
* Run 'lexeme' through the keyword scanner state table sst[].
*
* Returns the index of the accepting state reached with the last
* character of the lexeme (yylex() uses it as the token code), or 0 if
* the lexeme is not a keyword. On a match '*pfollowedby' receives the
* followed-by value of that state; otherwise it is left untouched.
*/
static int
is_keyword(
       char *lexeme,
       follby *pfollowedby
       )
{
       const char *p;
       int         state;      /* current state index */

       state = SCANNER_INIT_S;

       for (p = lexeme; '\0' != *p; p++) {
               /* try the alternatives for this position in turn */
               while (state && *p != SS_CH(sst[state]))
                       state = SS_OTHER_N(sst[state]);

               /* out of alternatives: not a keyword */
               if (!state)
                       return 0;

               /* matched the last character in an accepting state? */
               if ('\0' == p[1]
                   && FOLLBY_NON_ACCEPTING != SS_FB(sst[state])) {
                       *pfollowedby = SS_FB(sst[state]);
                       return state;
               }

               state = SS_MATCH_N(sst[state]);
       }

       return 0;
}


/* Integer
*
* Test whether 'lexeme' is a decimal integer: an optional leading '-'
* followed by at least one digit and nothing else.
*
* A non-negative value must additionally fit into a (signed) int;
* larger values are rejected so that they can be classified as U_int.
* Negative values are accepted without a range check here.
*
* Returns 1 if the lexeme qualifies and 0 otherwise. errno may be
* modified.
*/
static int
is_integer(
       char *lexeme
       )
{
       const char *    digits;
       const char *    p;
       unsigned long   val;
       int             is_neg;

       /* Allow a leading minus sign */
       digits = lexeme;
       is_neg = ('-' == *digits);
       if (is_neg)
               digits++;

       /* A sign on its own, or nothing at all, is not a number */
       if ('\0' == *digits)
               return 0;

       /* Check that all the remaining characters are digits */
       for (p = digits; '\0' != *p; p++) {
               if (!isdigit((unsigned char)*p))
                       return 0;
       }

       if (is_neg)
               return 1;

       /* Reject numbers that do not fit in signed int. strtoul()
        * reports overflow via ERANGE, whereas scanning with "%u" is
        * undefined for a value that does not fit.
        */
       errno = 0;
       val = strtoul(digits, NULL, 10);
       if (ERANGE == errno)
               return 0;

       return (val <= INT_MAX);
}


/* U_int -- assumes is_integer() has returned FALSE
*
* Test whether 'lexeme' is an unsigned number: either one or more
* decimal digits, or a "0x"/"0X" prefix followed by one or more
* hexadecimal digits. A prefix without digits and the empty string are
* rejected.
*
* Returns 1 if the lexeme qualifies and 0 otherwise. The value range
* is not checked here.
*/
static int
is_u_int(
       char *lexeme
       )
{
       const char *    p;
       int             is_hex;

       p = lexeme;
       is_hex = ('0' == p[0] && 'x' == tolower((unsigned char)p[1]));
       if (is_hex)
               p += 2;

       /* There must be something to convert */
       if ('\0' == *p)
               return 0;

       /* Check that all the remaining characters are digits */
       for (; '\0' != *p; p++) {
               if (is_hex) {
                       if (!isxdigit((unsigned char)*p))
                               return 0;
               } else {
                       if (!isdigit((unsigned char)*p))
                               return 0;
               }
       }

       return 1;
}


/* Double
*
* Test whether 'lexeme' is a floating point number of the form
*
*      [+|-] digits [. digits] [ (e|E) [+|-] digits ]
*
* The integer part and the fraction part may each be empty, but not
* both. If an exponent marker is present, at least one exponent digit
* must follow it.
*
* Returns 1 if the lexeme qualifies and 0 otherwise.
*/
static int
is_double(
       char *lexeme
       )
{
       const char *    p;
       size_t          mantissa_digits;
       size_t          exponent_digits;

       p = lexeme;
       mantissa_digits = 0;
       exponent_digits = 0;

       /* Check for an optional '+' or '-' */
       if ('+' == *p || '-' == *p)
               p++;

       /* Read the integer part */
       for (; isdigit((unsigned char)*p); p++)
               mantissa_digits++;

       /* Check for the optional decimal point and fraction digits */
       if ('.' == *p) {
               p++;
               for (; isdigit((unsigned char)*p); p++)
                       mantissa_digits++;
       }

       /*
        * The number of digits in both the decimal part and the
        * fraction part must not be zero at this point
        */
       if (0 == mantissa_digits)
               return 0;

       /* Check if we are done */
       if ('\0' == *p)
               return 1;

       /* There is still more input, it must be the exponent */
       if ('e' != tolower((unsigned char)*p))
               return 0;
       p++;

       /* Read an optional Sign */
       if ('+' == *p || '-' == *p)
               p++;

       /* Now read the exponent part */
       for (; isdigit((unsigned char)*p); p++)
               exponent_digits++;

       /* An exponent needs digits, and nothing may follow them */
       return (exponent_digits > 0 && '\0' == *p);
}


/* is_special() - Test whether 'ch' occurs in special_chars[], that is,
*                whether the character is a token.
*                Returns 1 if it does and 0 otherwise.
*/
static inline int
is_special(
       int ch
       )
{
       if (NULL == strchr(special_chars, ch))
               return 0;

       return 1;
}


/* is_EOC() - Test whether 'ch' ends a command: that is a newline when
*            old_config_style is set and a semicolon otherwise.
*            Returns 1 if it does and 0 otherwise.
*/
static int
is_EOC(
       int ch
       )
{
       const int terminator = old_config_style ? '\n' : ';';

       return (ch == terminator);
}


/* quote_if_needed() - Return a copy of 'str', wrapped in double quotes
* if it does not already start with one and contains a space or one
* of the characters in special_chars[]. Otherwise the copy is verbatim.
*
* The result is allocated with emalloc() and handed to the caller.
*/
char *
quote_if_needed(char *str)
{
       const size_t    bufsz = strlen(str) + 2 + 1; /* quotes and NUL */
       char *          copy = emalloc(bufsz);
       int             add_quotes;

       add_quotes = ('"' != str[0])
                    && (strpbrk(str, special_chars) != NULL
                        || strchr(str, ' ') != NULL);

       if (add_quotes)
               snprintf(copy, bufsz, "\"%s\"", str);
       else
               strlcpy(copy, str, bufsz);

       return copy;
}


/* create_string_token() - Turn 'lexeme' into a string token.
*
* A lexeme that is empty or consists of whitespace only yields T_EOC,
* which is also stored in yylval.Integer. For any other lexeme an
* estrdup() copy is stored in yylval.String and T_String is returned.
*/
static int
create_string_token(
       char *lexeme
       )
{
       const char *p;

       /* look for the first character that is not whitespace */
       for (p = lexeme; '\0' != *p; p++) {
               if (!isspace((u_char)*p))
                       break;
       }

       /* nothing but end of line whitespace */
       if ('\0' == *p) {
               yylval.Integer = T_EOC;
               return T_EOC;
       }

       yylval.String = estrdup(lexeme);
       return T_String;
}


/*
* yylex() - function that does the actual scanning.
* Bison expects this function to be called yylex and for it to take no
* input and return an int.
* Conceptually yylex "returns" yylval as well as the actual return
* value representing the token or type.
*/
int
yylex(void)
{
       static follby   followedby = FOLLBY_TOKEN;
       size_t          i;
       int             instring;
       int             yylval_was_set;
       int             converted;
       int             token;          /* The return value */
       int             ch;

       instring = FALSE;
       yylval_was_set = FALSE;

       do {
               /* Ignore whitespace at the beginning */
               while (EOF != (ch = lex_getch(lex_stack)) &&
                      isspace(ch) &&
                      !is_EOC(ch))

                       ; /* Null Statement */

               if (EOF == ch) {

                       if ( ! lex_pop_file())
                               return 0;
                       token = T_EOC;
                       goto normal_return;

               } else if (is_EOC(ch)) {

                       /* end FOLLBY_STRINGS_TO_EOC effect */
                       followedby = FOLLBY_TOKEN;
                       token = T_EOC;
                       goto normal_return;

               } else if (is_special(ch) && FOLLBY_TOKEN == followedby) {
                       /* special chars are their own token values */
                       token = ch;
                       /*
                        * '=' outside simulator configuration implies
                        * a single string following as in:
                        * setvar Owner = "The Boss" default
                        */
                       if ('=' == ch && old_config_style)
                               followedby = FOLLBY_STRING;
                       yytext[0] = (char)ch;
                       yytext[1] = '\0';
                       goto normal_return;
               } else
                       lex_ungetch(ch, lex_stack);

               /* save the position of start of the token */
               lex_stack->tokpos = lex_stack->curpos;

               /* Read in the lexeme */
               i = 0;
               while (EOF != (ch = lex_getch(lex_stack))) {

                       yytext[i] = (char)ch;

                       /* Break on whitespace or a special character */
                       if (isspace(ch) || is_EOC(ch)
                           || '"' == ch
                           || (FOLLBY_TOKEN == followedby
                               && is_special(ch)))
                               break;

                       /* Read the rest of the line on reading a start
                          of comment character */
                       if ('#' == ch) {
                               while (EOF != (ch = lex_getch(lex_stack))
                                      && '\n' != ch)
                                       ; /* Null Statement */
                               break;
                       }

                       i++;
                       if (i >= COUNTOF(yytext))
                               goto lex_too_long;
               }
               /* Pick up all of the string inside between " marks, to
                * end of line.  If we make it to EOL without a
                * terminating " assume it for them.
                *
                * XXX - HMS: I'm not sure we want to assume the closing "
                */
               if ('"' == ch) {
                       instring = TRUE;
                       while (EOF != (ch = lex_getch(lex_stack)) &&
                              ch != '"' && ch != '\n') {
                               yytext[i++] = (char)ch;
                               if (i >= COUNTOF(yytext))
                                       goto lex_too_long;
                       }
                       /*
                        * yytext[i] will be pushed back as not part of
                        * this lexeme, but any closing quote should
                        * not be pushed back, so we read another char.
                        */
                       if ('"' == ch)
                               ch = lex_getch(lex_stack);
               }
               /* Pushback the last character read that is not a part
                * of this lexeme. This fails silently if ch is EOF,
                * but then the EOF condition persists and is handled on
                * the next turn by the include stack mechanism.
                */
               lex_ungetch(ch, lex_stack);

               yytext[i] = '\0';
       } while (i == 0);

       /* Now return the desired token */

       /* First make sure that the parser is *not* expecting a string
        * as the next token (based on the previous token that was
        * returned) and that we haven't read a string.
        */

       if (followedby == FOLLBY_TOKEN && !instring) {
               token = is_keyword(yytext, &followedby);
               if (token) {
                       /*
                        * T_Server is exceptional as it forces the
                        * following token to be a string in the
                        * non-simulator parts of the configuration,
                        * but in the simulator configuration section,
                        * "server" is followed by "=" which must be
                        * recognized as a token not a string.
                        */
                       if (T_Server == token && !old_config_style)
                               followedby = FOLLBY_TOKEN;
                       goto normal_return;
               } else if (is_integer(yytext)) {
                       yylval_was_set = TRUE;
                       errno = 0;
                       if ((yylval.Integer = strtol(yytext, NULL, 10)) == 0
                           && ((errno == EINVAL) || (errno == ERANGE))) {
                               msyslog(LOG_ERR,
                                       "Integer cannot be represented: %s",
                                       yytext);
                               if (lex_from_file()) {
                                       exit(1);
                               } else {
                                       /* force end of parsing */
                                       yylval.Integer = 0;
                                       return 0;
                               }
                       }
                       token = T_Integer;
                       goto normal_return;
               } else if (is_u_int(yytext)) {
                       yylval_was_set = TRUE;
                       if ('0' == yytext[0] &&
                           'x' == tolower((unsigned long)yytext[1]))
                               converted = sscanf(&yytext[2], "%x",
                                                  &yylval.U_int);
                       else
                               converted = sscanf(yytext, "%u",
                                                  &yylval.U_int);
                       if (1 != converted) {
                               msyslog(LOG_ERR,
                                       "U_int cannot be represented: %s",
                                       yytext);
                               if (lex_from_file()) {
                                       exit(1);
                               } else {
                                       /* force end of parsing */
                                       yylval.Integer = 0;
                                       return 0;
                               }
                       }
                       token = T_U_int;
                       goto normal_return;
               } else if (is_double(yytext)) {
                       yylval_was_set = TRUE;
                       errno = 0;
                       if ((yylval.Double = atof(yytext)) == 0 && errno == ERANGE) {
                               msyslog(LOG_ERR,
                                       "Double too large to represent: %s",
                                       yytext);
                               exit(1);
                       } else {
                               token = T_Double;
                               goto normal_return;
                       }
               } else {
                       /* Default: Everything is a string */
                       yylval_was_set = TRUE;
                       token = create_string_token(yytext);
                       goto normal_return;
               }
       }

       /*
        * Either followedby is not FOLLBY_TOKEN or this lexeme is part
        * of a string.  Hence, we need to return T_String.
        *
        * _Except_ we might have a -4 or -6 flag on a an association
        * configuration line (server, peer, pool, etc.).
        *
        * This is a terrible hack, but the grammar is ambiguous so we
        * don't have a choice.  [SK]
        *
        * The ambiguity is in the keyword scanner, not ntp_parser.y.
        * We do not require server addresses be quoted in ntp.conf,
        * complicating the scanner's job.  To avoid trying (and
        * failing) to match an IP address or DNS name to a keyword,
        * the association keywords use FOLLBY_STRING in the keyword
        * table, which tells the scanner to force the next token to be
        * a T_String, so it does not try to match a keyword but rather
        * expects a string when -4/-6 modifiers to server, peer, etc.
        * are encountered.
        * restrict -4 and restrict -6 parsing works correctly without
        * this hack, as restrict uses FOLLBY_TOKEN.  [DH]
        */
       if ('-' == yytext[0]) {
               if ('4' == yytext[1]) {
                       token = T_Ipv4_flag;
                       goto normal_return;
               } else if ('6' == yytext[1]) {
                       token = T_Ipv6_flag;
                       goto normal_return;
               }
       }

       if (FOLLBY_STRING == followedby)
               followedby = FOLLBY_TOKEN;

       yylval_was_set = TRUE;
       token = create_string_token(yytext);

normal_return:
       if (T_EOC == token)
               DPRINTF(10, ("\t<end of command>\n"));
       else
               DPRINTF(10, ("yylex: lexeme '%s' -> %s\n", yytext,
                           token_name(token)));

       if (!yylval_was_set)
               yylval.Integer = token;

       return token;

lex_too_long:
       /*
        * DLH: What is the purpose of the limit of 50?
        * Is there any reason for yytext[] to be bigger?
        */
       yytext[min(sizeof(yytext) - 1, 50)] = 0;
       msyslog(LOG_ERR,
               "configuration item on line %d longer than limit of %lu, began with '%s'",
               lex_stack->curpos.nline, (u_long)min(sizeof(yytext) - 1, 50),
               yytext);

       /*
        * If we hit the length limit reading the startup configuration
        * file, abort.
        */
       if (lex_from_file())
               exit(sizeof(yytext) - 1);

       /*
        * If it's runtime configuration via ntpq :config treat it as
        * if the configuration text ended before the too-long lexeme,
        * hostname, or string.
        */
       yylval.Integer = 0;
       return 0;
}