/*
* File: utf2any.l
*
* (c) Peter Kleiweg 2000
*
* This is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2,
* or (at your option) any later version.
*
* Compile:
*     flex -B -8 utf2any.l
*     gcc -s -Wall -o utf2any lex.yy.c -lfl
*     rm lex.yy.c
*
*/

%{

#define UTFanyVERSION "1.0"

/*
* MAPDIR is the directory where symbol maps are searched for.
* This should be a path, ending with a slash, surrounded by double quotes,
* or it should be NULL.
*/

#ifndef MAPDIR
#  ifdef __MSDOS__
#    define MAPDIR "c:\\utf\\"
#  else
#    define MAPDIR "/usr/local/lib/utf/"
#  endif
#endif

#ifdef __MSDOS__
#  ifndef __COMPACT__
#    error Memory model COMPACT required
#  endif
#  include <dir.h>
#  include <fcntl.h>
#else
#  include <unistd.h>
#endif
#include <ctype.h>
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef __MSDOS__
#define strcasecmp(A, B) (stricmp((A), (B)))
#endif

/* Capacity (excluding the terminating NUL) of the text buffers buf2, buffer and bufword */
#define BUFSIZE 2048

/* Boolean type used for the option flags */
typedef enum { FALSE = 0, TRUE } BOOL_;

/* Input encoding; uUNDEF (zero) means neither -7 nor -8 was given */
typedef enum { uUNDEF = 0, uUTF7, uUTF8 } UTF_;

/* What to do with a code: echo it, print a space, skip it, or print it through a format string */
typedef enum { aECHO, aSPACE, aSKIP, aFORMAT } ACTION_;

/* Integer type a code is cast to before it is handed to a format string; cNONE for non-format actions */
typedef enum { cUCHAR, cUNSIGNED, cULONG, cNONE } CAST_;

/*
 * Translation of one single code (256 and above) into a string.
 */
typedef struct {
   unsigned long
       ul,      /* the code; sort and search key */
       order;   /* sequence number of the definition, used to order duplicates */
   char
       *s;      /* replacement text written to the output */
} TRANS_;

/*
 * Default treatment for a range of codes that have no translation of their own.
 */
typedef struct {
   long unsigned
       from,    /* first code of the range (inclusive) */
       to;      /* last code of the range (inclusive) */
   ACTION_
       action;  /* what to do with a code in this range */
   CAST_
       cast;    /* for aFORMAT: type the code is cast to */
   char
       *format; /* for aFORMAT: printf-style format string */
} RANGE_;

/* Option flags: -v reports begin/end of every map file, -w enables codewarn () messages */
BOOL_
   verbose = FALSE,
   warnings = FALSE;

/* Input encoding selected with -7 or -8 */
UTF_
   utf_type = uUNDEF;

/* Table of single-code translations; sorted in main () and searched with bsearch () */
TRANS_
   *trans = NULL;

/* Table of range actions; searched from the last entry to the first */
RANGE_
   *range = NULL;

char
   buf2    [BUFSIZE + 1],               /* scratch buffer for formatting codes below 256 */
   buffer  [BUFSIZE + 1],               /* current line of the map file being read */
   bufword [BUFSIZE + 1],               /* last word returned by getword () */
   f_unsigned [] = "[U+%04X]",          /* built-in format for codes cast to unsigned */
   f_ulong    [] = "[U+%08lX]",         /* built-in format for codes cast to unsigned long */
   *infile,                             /* name of the input, used in warnings */
   *lower [256],                        /* replacement per code below 256; NULL means echo */
   *no_mem_buffer,                      /* reserve that is freed before reporting out-of-memory */
   out_of_memory [] = "Out of memory",
   *programname,                        /* name used as prefix in messages */
   s_echo  [] = "#ECHO#",               /* keywords recognised in map files */
   s_skip  [] = "#SKIP#",
   s_space [] = "#SPACE#";

int
   bufp,                                /* read position inside buffer */
   max_range = 0,                       /* allocated entries in range */
   max_trans = 0,                       /* allocated entries in trans */
   n_range = 0,                         /* used entries in range */
   n_trans = 0,                         /* used entries in trans */
   wtable [256];                        /* non-zero: warn when this code below 256 is seen */

/* State of the UTF-7 base64 decoder, see utf7 () and nextout () */
unsigned int
   instep,
   outcode [2],
   outstep;

unsigned long
   order = 0,                           /* next sequence number handed out by addchar () */
   incount = 1;                         /* current line number of the input, for warnings */

/*
 * Forward declarations of the functions defined in the user code section
 * at the end of this file, grouped by return type.
 */

/* functions returning nothing, plus the two allocators returning void * */
void
   addchar (char *filename, int lineno, unsigned long ul, char *s),
   addaction (
       char *filename,
       int lineno,
       long unsigned from,
       long unsigned to,
       ACTION_ action,
       CAST_ cast,
       char *format
   ),
   bytes2 (void),
   bytes3 (void),
   bytes4 (void),
   bytes5 (void),
   bytes6 (void),
   codewarn (unsigned long ul),
   errit (char const *format, ...),
   ferrit (char *filename, int lineno, char const *format, ...),
   get_programname (char const *argv0),
   nextout (void),
   outchar (unsigned char i),
   outsymbol (unsigned long ul),
   readtrans (char *file, char *dir, int level),
   *s_malloc (size_t size),
   *s_realloc (void *block, size_t size),
   syntax (void),
   utf7 (void);

/* functions returning a string */
char
   *getbasename (char *s),
   *getdirname (char *s),
   *getword (char *filename, int lineno),
   *s_strdup (char const *s);

/*
 * POSIX <stdio.h> declares its own
 *     ssize_t getline (char **, size_t *, FILE *);
 * which conflicts with the map-file line reader of this program.
 * Remap our name from here on, so that the prototype, the call and the
 * definition further down all refer to a private function.
 */
#ifdef getline
#  undef getline
#endif
#define getline utf2any_getline

/* functions returning an int */
int
   getline (FILE *fp, int *lineno),
   nlcount (void),
   searchcmp (const void *p1, const void *p2),
   srtcmp (const void *p1, const void *p2);

/* functions returning a code value */
long unsigned
   getvalue (char *filename, int lineno);

/*
 * Scanner configuration.
 * NOTE(review): YY_NO_UNPUT and YY_SKIP_YYWRAP are flex skeleton switches,
 * presumably to leave out yyunput () and the default yywrap declaration;
 * confirm against the flex version in use.
 */
#define YY_NO_UNPUT
#define YY_SKIP_YYWRAP
#ifdef yywrap
#  undef yywrap
#endif
/*
 * Called by the scanner at end of input.
 * Always returns 1; flex treats a non-zero result as: no further input.
 */
int yywrap()
{
   return 1;
}

%}

%Start _utf7 _utf7b _utf8

%%

<INITIAL>.|\n                 { /* put the character back and enter the state for the selected encoding */
                                yyless (0);
                                BEGIN ((utf_type == uUTF7) ? _utf7 : _utf8);
                              }

<_utf7>{
"+-"                          { /* plus followed by minus stands for a literal plus sign */
                                outchar ('+'); }
"+"                           { /* start of a base64 run: reset the decoder state */
                                instep = outstep = 0;
                                BEGIN _utf7b; }
}

<_utf7b>{
[A-Za-z0-9+/]                 { utf7 (); }
"-"                           { /* the terminating minus is absorbed */
                                BEGIN _utf7; }
.|\n                          { /* any other character ends the base64 run and is itself output */
                                if (yytext [0] == '\n')
                                    incount++;
                                outchar (yytext [0]);
                                BEGIN _utf7; }
}

<_utf8>{
[\300-\337].                  { incount += nlcount (); bytes2 (); }
[\340-\357]..                 { incount += nlcount (); bytes3 (); }
[\360-\367]...                { incount += nlcount (); bytes4 (); }
[\370-\373]....               { incount += nlcount (); bytes5 (); }
[\374-\375].....              { incount += nlcount (); bytes6 (); }
}

<_utf7,_utf8>.|\n             { /* plain character: count lines and pass it through outchar */
                                if (yytext [0] == '\n')
                                    incount++;
                                outchar (yytext [0]);
                              }

%%

/*
* Helper functions for UTF-7 parser
*/

/*
 * Decode one base64 character of a UTF-7 shifted sequence.
 *
 * The character is taken from yytext [0].  Each character contributes six
 * bits; the bits are collected in outcode [outstep], and nextout () is
 * called every time a byte has been completed.  instep counts the position
 * (0..3) inside the current group of four base64 characters.
 *
 * A character outside the base64 alphabet is ignored, so the six-bit value
 * is never used without having been set.
 */
void utf7 ()
{
    unsigned
        i,
        c;

    i = (unsigned char) yytext [0];
    if (i >= 'A' && i <= 'Z')
        c = i - 'A';
    else if (i >= 'a' && i <= 'z')
        c = i + 26 - 'a';
    else if (i >= '0' && i <= '9')
        c = i + 52 - '0';
    else if (i == '+')
        c = 62;
    else if (i == '/')
        c = 63;
    else
        return;     /* not a base64 character: nothing to decode */

    switch (instep) {
        case 0:
            /* six bits: upper part of a byte */
            outcode [outstep] = (c << 2);
            break;
        case 1:
            /* two bits complete the byte, four bits start the next one */
            outcode [outstep] |= (c >> 4);
            nextout ();
            outcode [outstep] = (c << 4);
            break;
        case 2:
            /* four bits complete the byte, two bits start the next one */
            outcode [outstep] |= (c >> 2);
            nextout ();
            outcode [outstep] = (c << 6);
            break;
        case 3:
            /* six bits complete the byte */
            outcode [outstep] |= c;
            nextout ();
            break;
    }
    if (++instep == 4)
        instep = 0;
}

/*
 * Called when a byte in outcode [outstep] is complete.
 * After the first byte only the position advances; after the second byte
 * both bytes (each masked to eight bits) are joined, first byte high, and
 * the resulting value is passed to outsymbol ().
 */
void nextout ()
{
    if (outstep != 0) {
        outstep = 0;
        outsymbol (((outcode [0] & 0xFF) << 8) | (outcode [1] & 0xFF));
        return;
    }
    outstep = 1;
}

/*
* Helper functions for UTF-8 parser
*/

/*
 * Decode the two bytes in yytext: five bits from the first byte and
 * six bits from the second, and pass the value to outsymbol ().
 */
void bytes2 ()
{
    unsigned
        b0 = (unsigned char) yytext [0],
        b1 = (unsigned char) yytext [1];

    outsymbol (((b0 & 0x1F) << 6) | (b1 & 0x3F));
}


/*
 * Decode the three bytes in yytext: four bits from the first byte and
 * six bits from each of the others, and pass the value to outsymbol ().
 */
void bytes3 ()
{
    unsigned
        b0 = (unsigned char) yytext [0],
        b1 = (unsigned char) yytext [1],
        b2 = (unsigned char) yytext [2];

    outsymbol (((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
}

/*
 * Decode the four bytes in yytext: three bits from the first byte and
 * six bits from each of the others, and pass the value to outsymbol ().
 */
void bytes4 ()
{
    long unsigned
        value;
    int
        k;

    value = (unsigned char) yytext [0] & 0x07;
    for (k = 1; k < 4; k++)
        value = (value << 6) | ((long unsigned) (unsigned char) yytext [k] & 0x3F);

    outsymbol (value);
}

/*
 * Decode the five bytes in yytext: two bits from the first byte and
 * six bits from each of the others, and pass the value to outsymbol ().
 */
void bytes5 ()
{
    long unsigned
        value;
    int
        k;

    value = (unsigned char) yytext [0] & 0x03;
    for (k = 1; k < 5; k++)
        value = (value << 6) | ((long unsigned) (unsigned char) yytext [k] & 0x3F);

    outsymbol (value);
}

/*
 * Decode the six bytes in yytext: one bit from the first byte and
 * six bits from each of the others, and pass the value to outsymbol ().
 */
void bytes6 ()
{
    long unsigned
        value;
    int
        k;

    value = (unsigned char) yytext [0] & 0x01;
    for (k = 1; k < 6; k++)
        value = (value << 6) | ((long unsigned) (unsigned char) yytext [k] & 0x3F);

    outsymbol (value);
}

/*
* General helper functions for parser
*/

/*
 * Output a single-byte code.
 * Issues a warning when wtable marks the code, then writes either the
 * replacement string from lower [] or, when there is none, the byte itself.
 */
void outchar (unsigned char c)
{
    char
        *replacement = lower [c];

    if (wtable [c])
        codewarn (c);

    if (replacement)
        fputs (replacement, yyout);
    else
        fputc (c, yyout);
}

void outsymbol (unsigned long ul)
{
   int
       i;
   TRANS_
       *p;

   if (ul < 256) {
       if (wtable [ul])
           codewarn (ul);
       if (! lower [ul])
           fputc ((unsigned int) ul, yyout);
       else
           fputs (lower [ul], yyout);
       return;
   }

   p = (TRANS_ *) bsearch (&ul, trans, n_trans, sizeof (TRANS_), searchcmp);
   if (p) {
       fputs (p->s, yyout);
       return;
   }

   codewarn (ul);

   for (i = n_range - 1; i >= 0; i--)
       if (ul >= range [i].from && ul <= range [i].to)
           break;
   switch (range [i].action) {
       case aSPACE:
           fputc (' ', yyout);
           break;
       case aFORMAT:
           if (range [i].cast == cUCHAR)
               fprintf (yyout, range [i].format, (unsigned char) ul);
           else if (range [i].cast == cUNSIGNED)
               fprintf (yyout, range [i].format, (unsigned) ul);
           else
               fprintf (yyout, range [i].format, (unsigned long) ul);
           break;
       case aECHO:
           /* won't happen beyond 255 */
       case aSKIP:
           break;
   }
}

/*
 * When warnings are enabled, report a code on stderr together with the
 * input name and the current input line number.  The code is printed in
 * hexadecimal and decimal; codes from 0x10000 up use the wider layout.
 */
void codewarn (unsigned long ul)
{
    if (warnings) {
        if (ul >= 0x10000)
            fprintf (stderr, "%s:%lu: U+%08lX %10lu\n", infile, incount, ul, ul);
        else
            fprintf (stderr, "%s:%lu: U+%04X %5u\n", infile, incount, (unsigned) ul, (unsigned) ul);
    }
}

/*
 * Return the number of newline characters in yytext,
 * scanning up to the first NUL character.
 */
int nlcount ()
{
    char
        *p;
    int
        newlines = 0;

    for (p = yytext; *p != '\0'; p++)
        if (*p == '\n')
            newlines++;
    return newlines;
}


int main (int argc, char *argv [])
{
   int
       i;

   no_mem_buffer = (char *) malloc (1024);

   get_programname (argv [0]);

   /* pre-defined actions */
   for (i = 0; i < 256; i++) {
       lower [i] = NULL;
       wtable [i] = 0;
   }
   addaction (NULL, 0, 0x10000, 0x7FFFFFFF, aFORMAT, cULONG,    f_ulong);
   addaction (NULL, 0,   0x100,     0xFFFF, aFORMAT, cUNSIGNED, f_unsigned);
   addaction (NULL, 0,    0x7F,       0x9F, aFORMAT, cUNSIGNED, f_unsigned);
   addaction (NULL, 0,       0,       0x1F, aFORMAT, cUNSIGNED, f_unsigned);
   addchar (NULL, 0, '\t', s_echo);
   addchar (NULL, 0, '\n', s_echo);
   addchar (NULL, 0, '\r', s_echo);
   addchar (NULL, 0, '\f', s_echo);

   while (argc > 1 && argv [1][0] == '-') {
       if (! strcmp (argv [1], "-7"))
           utf_type = uUTF7;
       else if (! strcmp (argv [1], "-8"))
           utf_type = uUTF8;
       else if (argv [1][1] == 'f') {
           if (argv [1][2])
               readtrans (argv [1] + 2, MAPDIR, 0);
           else {
               if (argc == 2)
                   errit ("Missing argument for option '-f'");
               argv++;
               argc--;
               readtrans (argv [1], MAPDIR, 0);
           }
       } else if (! strcmp (argv [1], "-v"))
           verbose = TRUE;
       else if (! strcmp (argv [1], "-w"))
           warnings = TRUE;
       else
           syntax ();
       argv++;
       argc--;
   }

   if (n_trans) {
       qsort (trans, n_trans, sizeof (TRANS_), srtcmp);
       i = 0;
       while (i < n_trans - 1)
           if (trans [i].ul == trans [i + 1].ul) {
               memmove (trans + i, trans + i + 1, (n_trans - i - 1) * sizeof (TRANS_));
               n_trans--;
           } else
               i++;
   }

   switch (argc) {
       case 1:
           if (isatty (fileno (stdin)))
               syntax ();
           yyin = stdin;
           infile = "(stdin)";
           break;
       case 2:
           yyin = fopen (argv [1], "r");
           if (! yyin)
               errit ("Opening file \"%s\": %s", argv [1], strerror (errno));
           infile = argv [1];
           break;
       default:
           syntax ();
   }

   if (! utf_type)
       errit ("Missing option '-7' or '-8'");

   yyout = stdout;

#ifdef __MSDOS__
   setmode (fileno (yyin ), O_BINARY);
   setmode (fileno (yyout), O_BINARY);
#endif

   yylex ();

   if (yyin != stdin)
       fclose (yyin);
   if (yyout != stdout)
       fclose (yyin);

   return 0;
}

int srtcmp (const void *p1, const void *p2)
{
   unsigned long
       ul1,
       ul2;


   ul1 = ((TRANS_ *)p1)->ul;
   ul2 = ((TRANS_ *)p2)->ul;
   if (ul1 < ul2)
       return -1;
   else if (ul1 > ul2)
       return 1;

   ul1 = ((TRANS_ *)p1)->order;
   ul2 = ((TRANS_ *)p2)->order;
   if (ul1 < ul2)
       return -1;
   else
       return 1;
}

/*
 * Comparison function for bsearch () on the trans table.
 * p1 points to the code searched for (an unsigned long),
 * p2 points to a table entry.  Returns -1, 0 or 1.
 */
int searchcmp (const void *p1, const void *p2)
{
    unsigned long
        key = *((unsigned long *) p1),
        code = ((TRANS_ *) p2)->ul;

    if (key == code)
        return 0;
    return (key < code) ? -1 : 1;
}

void readtrans (char *file, char *dir, int level)
{
   int
       lineno;
   long unsigned
       from,
       to,
       ul;
   char
       *s,
       *filename,
       *basename,
       *dirname;
   FILE
       *fp;
   CAST_
       cast;

   if (level > 10)
       errit ("File \"%s\": nesting too deep", file);

   /*
    * Try opening file
    * If failure and filename has no directory part, then try in dir
    */
   filename = file;
   fp = fopen (filename, "r");
   if ((! fp) && dir) {
       basename = getbasename (filename);
       if (! strcmp (basename, filename)) {
           filename = (char *) s_malloc (
                          (strlen (basename) + strlen (dir) + 1) * sizeof (char)
                      );
           strcpy (filename, dir);
           strcat (filename, basename);
           fp = fopen (filename, "r");
       }
   }
   if (! fp)
       errit ("Opening file \"%s\": %s", filename, strerror (errno));
   if (verbose)
       fprintf (stderr, "Begin %s\n", filename);

   dirname = getdirname (filename);
   if (! dirname)
       dirname = dir;

   lineno = 0;
   while (getline (fp, &lineno)) {
       switch (buffer [bufp]) {
           /* translation for range of characters */
           case 'd':
           case 'D':
               bufp++;
               from = getvalue (filename, lineno);
               to = getvalue (filename, lineno);
               s = getword (filename, lineno);
               if (! strcasecmp (s, s_skip))
                   addaction (filename, lineno, from, to, aSKIP, cNONE, NULL);
               else if (! strcasecmp (s, s_echo))
                   addaction (filename, lineno, from, to, aECHO, cNONE, NULL);
               else if (! strcasecmp (s, s_space))
                   addaction (filename, lineno, from, to, aSPACE, cNONE, NULL);
               else {
                   if (! strcmp (s, "uchar"))
                       cast = cUCHAR;
                   else if (! strcmp (s, "unsigned"))
                       cast = cUNSIGNED;
                   else if (! strcmp (s, "ulong"))
                       cast = cULONG;
                   else
                       ferrit (filename, lineno, "Illegal action \"%s\"", s);
                   addaction (filename, lineno, from, to, aFORMAT, cast, buffer + bufp);
               }
               break;
           /* include file */
           case 'i':
           case 'I':
               bufp++;
               readtrans (s_strdup (getword (filename, lineno)), dirname, level + 1);
               break;
           /* single character translation */
           default:
               ul = getvalue (filename, lineno);
               addchar (filename, lineno, ul, buffer + bufp);
       }
   }
   fclose (fp);
   if (verbose)
       fprintf (stderr, "End %s\n", filename);
}

void addchar (char *filename, int lineno, unsigned long ul, char *s)
{
   BOOL_
       echo;

   if (ul > 0x7fffffff)
       ferrit (filename, lineno, "%s out of range: 0x%lX", s, ul);

   echo = FALSE;

   if (! strcasecmp (s, s_skip))
       s = "";
   else if (! strcasecmp (s, s_space))
       s = " ";
   else if (! strcasecmp (s, s_echo))
       echo = TRUE;

   if (ul < 256) {
       lower [ul] = echo ? NULL : s_strdup (s);
       wtable [ul] = 0;
   } else {
       if (echo)
           ferrit (filename, lineno, "%s out of range: 0x%lX", s_echo, ul);
       if (n_trans == max_trans) {
           max_trans += 1024;
           trans = (TRANS_ *) s_realloc (trans, max_trans * sizeof (TRANS_));
       }
       trans [n_trans].ul = ul;
       trans [n_trans].order = order++;
       trans [n_trans++].s = s_strdup (s);
   }
}

/*
 * Define the default action for a range of codes.
 *
 * filename, lineno  position in the map file, for error messages
 * from, to          first and last code of the range (inclusive);
 *                   from > to, or a value above 0x7fffffff, is an error
 * action            what to do with codes in the range
 * cast              for aFORMAT: type the code is cast to
 * format            for aFORMAT: printf-style format string
 *
 * The part of the range below 256 is expanded at once into lower [] and
 * flagged in wtable [].  The part from 256 up is appended to the range
 * table, which grows in steps of 256 entries; echo is not allowed there.
 * Every field of a new table entry is set, so none is left indeterminate
 * for actions that do not use a format.
 *
 * NOTE(review): the sprintf calls write into buf2 without a bound; a
 * format taken from a map file with a large field width could overflow it.
 */
void addaction (char *filename,
                int lineno,
                long unsigned from,
                long unsigned to,
                ACTION_ action,
                CAST_ cast,
                char *format)
{
    long unsigned
        u;

    if (from > to)
        ferrit (filename, lineno, "Illegal range");

    if (from > 0x7fffffff)
        ferrit (filename, lineno, "Begin of range to large: 0x%lX", from);

    if (to > 0x7fffffff)
        ferrit (filename, lineno, "End of range to large: 0x%lX", to);

    for (u = from; u <= to && u < 256; u++) {
        wtable [u] = 1;
        if (action == aSKIP)
            lower [u] = "";
        else if (action == aSPACE)
            lower [u] = " ";
        else if (action == aECHO)
            lower [u] = NULL;
        else {
            if (cast == cUCHAR)
                sprintf (buf2, format, (unsigned char) u);
            else if (cast == cUNSIGNED)
                sprintf (buf2, format, (unsigned) u);
            else
                sprintf (buf2, format, (long unsigned) u);
            lower [u] = s_strdup (buf2);
        }
    }

    /* only the part of the range from 256 up goes into the table */
    if (from < 256)
        from = 256;
    if (from > to)
        return;

    if (action == aECHO)
        ferrit (filename, lineno, "Out of range for %s", s_echo);

    if (n_range == max_range) {
        max_range += 256;
        range = (RANGE_ *) s_realloc (range, max_range * sizeof (RANGE_));
    }
    range [n_range].from = from;
    range [n_range].to = to;
    range [n_range].action = action;
    range [n_range].cast = cast;
    range [n_range].format = (action == aFORMAT) ? s_strdup (format) : NULL;
    n_range++;
}

/*
 * Read a numeric value from buffer at position bufp.
 *
 * Leading white space is skipped.  A value that starts with u+, U+, 0x or
 * 0X is read as hexadecimal, any other value starting with 0 as octal,
 * everything else as decimal.  On return bufp is positioned after the
 * value and any white space that follows it.
 * A missing value is a fatal error.
 */
long unsigned getvalue (char *filename, int lineno)
{
    long unsigned
        value;
    int
        used,
        is_hex;
    char
        first,
        *format;

    for (; buffer [bufp] && isspace ((unsigned char) buffer [bufp]); bufp++)
        ;

    first = buffer [bufp];
    is_hex = 0;
    if (first == 'u' || first == 'U')
        is_hex = (buffer [bufp + 1] == '+');
    else if (first == '0')
        is_hex = (buffer [bufp + 1] == 'x' || buffer [bufp + 1] == 'X');

    if (is_hex) {
        bufp += 2;      /* step over the prefix */
        format = "%lx%n";
    } else
        format = (first == '0') ? "%lo%n" : "%lu%n";

    if (sscanf (buffer + bufp, format, &value, &used) != 1)
        errit ("Missing value in \"%s\", line %i", filename, lineno);

    for (bufp += used; buffer [bufp] && isspace ((unsigned char) buffer [bufp]); bufp++)
        ;
    return value;
}

/*
 * Read a word (a run of non-space characters) from buffer at position
 * bufp into bufword and return bufword.  White space before and after
 * the word is skipped.  A missing word is a fatal error.
 */
char *getword (char *filename, int lineno)
{
    int
        used;

    for (; buffer [bufp] && isspace ((unsigned char) buffer [bufp]); bufp++)
        ;

    if (sscanf (buffer + bufp, "%s%n", bufword, &used) != 1)
        errit ("Missing word in \"%s\", line %i", filename, lineno);

    for (bufp += used; buffer [bufp] && isspace ((unsigned char) buffer [bufp]); bufp++)
        ;
    return bufword;
}

/*
 * Return a pointer to the part of filename after the last directory
 * separator, or filename itself when it contains no separator.
 * The result points into filename; nothing is allocated.
 */
char *getbasename (char *filename)
{
#ifdef __MSDOS__
    char
        *sep = strrchr (filename, '\\');
#else   /* unix */
    char
        *sep = strrchr (filename, '/');
#endif

    return sep ? sep + 1 : filename;
}

/*
 * Return a newly allocated copy of the directory part of filename,
 * including the final separator, or NULL when filename contains no
 * separator.  filename is modified temporarily and restored.
 */
char *getdirname (char *filename)
{
    char
        saved,
        *sep,
        *result;

#ifdef __MSDOS__
    sep = strrchr (filename, '\\');
#else   /* unix */
    sep = strrchr (filename, '/');
#endif
    if (! sep)
        return NULL;

    /* cut the string right after the separator while it is copied */
    saved = sep [1];
    sep [1] = '\0';
    result = s_strdup (filename);
    sep [1] = saved;

    return result;
}

/*
 * Read the next significant line of a map file into buffer.
 *
 * Trailing white space is removed, and bufp is set to the first
 * character that is not white space.  Empty lines and lines whose first
 * such character is '#' are skipped.  *lineno is incremented for every
 * line read, skipped or not.
 *
 * Returns 1 when a line is available, 0 at end of file.
 */
int getline (FILE *fp, int *lineno)
{
    size_t
        len;

    while (fgets (buffer, BUFSIZE, fp) != NULL) {
        (*lineno)++;

        len = strlen (buffer);
        while (len > 0 && isspace ((unsigned char) buffer [len - 1]))
            buffer [--len] = '\0';

        for (bufp = 0; buffer [bufp] && isspace ((unsigned char) buffer [bufp]); bufp++)
            ;

        if (buffer [bufp] != '\0' && buffer [bufp] != '#')
            return 1;
    }
    return 0;
}

/*
 * Report a fatal error in a map file on stderr and exit with status 1.
 *
 * filename, lineno  position of the error; filename may be NULL for the
 *                   built-in definitions, which is then printed as (builtin)
 * format, ...       printf-style message
 *
 * Does not return.
 */
void ferrit (char *filename, int lineno, char const *format, ...)
{
    va_list
        list;

    fprintf (
        stderr,
        "\nError %s: in file \"%s\", line %i: ",
        programname,
        filename ? filename : "(builtin)",
        lineno
    );

    va_start (list, format);
    vfprintf (stderr, format, list);
    va_end (list);

    fprintf (stderr, "\n\n");

    exit (1);
}

/*
 * Report a fatal error on stderr, prefixed with the program name,
 * and exit with status 1.
 *
 * format, ...  printf-style message
 *
 * Does not return.
 */
void errit (char const *format, ...)
{
    va_list
        list;

    fprintf (stderr, "\nError %s: ", programname);

    va_start (list, format);
    vfprintf (stderr, format, list);
    va_end (list);

    fprintf (stderr, "\n\n");

    exit (1);
}

/*
 * Set programname from argv [0]: the file name without its directory
 * part (and, on MS-DOS, without drive and extension).
 *
 * When argv0 is NULL, or the copy cannot be allocated, a fixed name is
 * used, so that programname is always a valid string for messages.
 */
void get_programname (char const *argv0)
{
    static char
        fallback [] = "utf2any";
#ifdef __MSDOS__
    char
        name [MAXFILE];

    if (! argv0) {
        programname = fallback;
        return;
    }
    fnsplit (argv0, NULL, NULL, name, NULL);
    programname = strdup (name);
#else   /* unix */
    char const
        *p;

    if (! argv0) {
        programname = fallback;
        return;
    }
    p = strrchr (argv0, '/');
    programname = strdup (p ? p + 1 : argv0);
#endif
    if (! programname)
        programname = fallback;
}

/*
 * malloc () that does not return on failure: the reserve buffer is
 * released and the out-of-memory error is reported through errit ().
 */
void *s_malloc (size_t size)
{
    void
        *p = malloc (size);

    if (p)
        return p;

    free (no_mem_buffer);
    errit (out_of_memory);
    return p;
}

/*
 * realloc () that does not return on failure: the reserve buffer is
 * released and the out-of-memory error is reported through errit ().
 */
void *s_realloc (void *block, size_t size)
{
    void
        *p = realloc (block, size);

    if (p)
        return p;

    free (no_mem_buffer);
    errit (out_of_memory);
    return p;
}

/*
 * Return a newly allocated copy of s; a NULL argument gives a newly
 * allocated empty string.  Allocation is done with s_malloc ().
 */
char *s_strdup (char const *s)
{
    char const
        *src = s ? s : "";
    char
        *copy;

    copy = (char *) s_malloc (strlen (src) + 1);
    strcpy (copy, src);
    return copy;
}

void syntax ()
{
   fprintf (
       stderr,
       "\n"
       "This is utf2any, version " UTFanyVERSION "\n"
       "\n"
       "Usage: %s -7|-8 [-f mapfile] [-v] [-w] [infile]\n"
       "\n"
       "  -7 : Input is UTF-7\n"
       "  -8 : Input is UTF-8\n"
       "  -f : File with definitions of the symbol mappings\n"
       "       If multiple -f options are given, the files are processed in turn\n"
       "  -v : Verbose\n"
       "  -w : Warning messages\n"
       "\n",
       programname
   );
   exit (1);
}