/* #define DEBUG */
#include <stdio.h>
#include "sqcom.h"
#include "sq.h"
#define ERROR -1
#define TRUE 1
#define FALSE 0

/******** Second translation - bytes to variable length bit strings *********/


/* This translation uses the Huffman algorithm to develop a
* binary tree representing the decoding information for
* a variable length bit string code for each input value.
* Each string's length is in inverse proportion to its
* frequency of appearance in the incoming data stream.
* The encoding table is derived from the decoding table.
*
* The range of valid values into the Huffman algorithm are
* the values of a byte stored in an integer plus the special
* endfile value chosen to be an adjacent value. Overall, 0-SPEOF.
*
* The "node" array of structures contains the nodes of the
* binary tree. The first NUMVALS nodes are the leaves of the
* tree and represent the values of the data bytes being
* encoded and the special endfile, SPEOF.
* The remaining nodes become the internal nodes of the tree.
*
* In the original design it was believed that
* a Huffman code would fit in the same number of
* bits that will hold the sum of all the counts.
* That was disproven by a user's file and was a rare but
* infamous bug. This version attempts to choose among equally
* weighted subtrees according to their maximum depths to avoid
* unnecessarily long codes. In case that is not sufficient
* to guarantee codes <= 16 bits long, we initially scale
* the counts so the total fits in an unsigned integer, but
* if codes longer than 16 bits are generated the counts are
* rescaled to a lower ceiling and code generation is retried.
*/

/* Initialize the Huffman translation. This requires reading
* the input file through any preceding translation functions
* to get the frequency distribution of the various values.
*/

init_huff(ib)
FILE *ib;
{
       int c, i;
       int btlist[NUMVALS];    /* list of intermediate binary trees */
       int listlen;            /* length of btlist */
       unsigned int *wp;       /* simplifies weight counting */
       unsigned int ceiling;   /* limit for scaling */

       /* Initialize tree nodes to no weight, no children */
       init_tree();

       /* Build frequency info in tree */
       do {
               c = getcnr(ib);
               if(c == EOF)
                       c = SPEOF;
               if(*(wp = &node[c].weight) !=  MAXCOUNT)
                       ++(*wp);
       } while(c != SPEOF);
#ifdef DEBUG
       pcounts();      /* debugging aid */
#endif
       ceiling = MAXCOUNT;

       do {    /* Keep trying to scale and encode */
               if(ceiling != MAXCOUNT)
                       printf("*** rescaling ***, ");
               scale(ceiling);
               ceiling /= 2;   /* in case we rescale */
#ifdef DEBUG
               pcounts();      /* debugging aid */
#endif

               /* Build list of single node binary trees having
                * leaves for the input values with non-zero counts
                */
               for(i = listlen = 0; i < NUMVALS; ++i)
                       if(node[i].weight != 0) {
                               node[i].tdepth = 0;
                               btlist[listlen++] = i;
                       }

               /* Arrange list of trees into a heap with the entry
                * indexing the node with the least weight at the top.
                */
               heap(btlist, listlen);

               /* Convert the list of trees to a single decoding tree */
               bld_tree(btlist, listlen);

               /* Initialize the encoding table */
               init_enc();

               /* Try to build encoding table.
                * Fail if any code is > 16 bits long.
                */
       } while(buildenc(0, dctreehd) == ERROR);
#ifdef DEBUG
       phuff();        /* debugging aid */
#endif
       /* Initialize encoding variables */
       cbitsrem = 0;   /*force initial read */
       curin = 0;      /*anything but endfile*/
}
/* */
/* The count of number of occurrances of each input value
* have already been prevented from exceeding MAXCOUNT.
* Now we must scale them so that their sum doesn't exceed
* ceiling and yet no non-zero count can become zero.
* This scaling prevents errors in the weights of the
* interior nodes of the Huffman tree and also ensures that
* the codes will fit in an unsigned integer. Rescaling is
* used if necessary to limit the code length.
*/

scale(ceil)
unsigned int ceil;      /* upper limit on total weight */
{
       int c, ovflw, divisor, i;
       unsigned int w, sum;
       char increased;         /* flag */

       do {
               for(i = sum = ovflw = 0; i < NUMVALS; ++i) {
                       if(node[i].weight > (ceil - sum))
                               ++ovflw;
                       sum += node[i].weight;
               }

               divisor = ovflw + 1;

               /* Ensure no non-zero values are lost */
               increased = FALSE;
               for(i = 0; i < NUMVALS; ++i) {
                       w = node[i].weight;
                       if (w < divisor && w > 0) {
                               /* Don't fail to provide a code if it's used at all */
                               node[i].weight = divisor;
                               increased = TRUE;
                       }
               }
       } while(increased);

       /* Scaling factor choosen, now scale */
       if(divisor > 1)
               for(i = 0; i < NUMVALS; ++i)
                       node[i].weight /= divisor;
}
/* */
/* heap() and adjust() maintain a list of binary trees as a
* heap with the top indexing the binary tree on the list
* which has the least weight or, in case of equal weights,
* least depth in its longest path. The depth part is not
* strictly necessary, but tends to avoid long codes which
* might provoke rescaling.
*/

heap(list, length)
int list[], length;
{
       int i;

       for(i = (length - 2) / 2; i >= 0; --i)
               adjust(list, i, length - 1);
}

/* Make a heap from a heap with a new top */

adjust(list, top, bottom)
int list[], top, bottom;
{
       int k, temp;
       char cmptrees();

       k = 2 * top + 1;        /* left child of top */
       temp = list[top];       /* remember root node of top tree */
       if( k <= bottom) {
               if( k < bottom && cmptrees(list[k], list[k + 1]))
                       ++k;

               /* k indexes "smaller" child (in heap of trees) of top */
               /* now make top index "smaller" of old top and smallest child */
               if(cmptrees(temp, list[k])) {
                       list[top] = list[k];
                       list[k] = temp;
                       /* Make the changed list a heap */
                       adjust(list, k, bottom); /*recursive*/
               }
       }
}

/* Compare two trees, if a > b return true, else return false
* note comparison rules in previous comments.
*/

char    /* Boolean */
cmptrees(a, b)
int a, b;       /* root nodes of trees */
{
       if(node[a].weight > node[b].weight)
               return TRUE;
       if(node[a].weight == node[b].weight)
               if(node[a].tdepth > node[b].tdepth)
                       return TRUE;
       return FALSE;
}
/* */
/* HUFFMAN ALGORITHM: develops the single element trees
* into a single binary tree by forming subtrees rooted in
* interior nodes having weights equal to the sum of weights of all
* their descendents and having depth counts indicating the
* depth of their longest paths.
*
* When all trees have been formed into a single tree satisfying
* the heap property (on weight, with depth as a tie breaker)
* then the binary code assigned to a leaf (value to be encoded)
* is then the series of left (0) and right (1)
* paths leading from the root to the leaf.
* Note that trees are removed from the heaped list by
* moving the last element over the top element and
* reheaping the shorter list.
*/

bld_tree(list, len)
int list[];
int len;
{
       int freenode;           /* next free node in tree */
       int lch, rch;           /* temporaries for left, right children */
       struct nd *frnp;        /* free node pointer */
       int i;
       char maxchar();

       /* Initialize index to next available (non-leaf) node.
        * Lower numbered nodes correspond to leaves (data values).
        */
       freenode = NUMVALS;

       while(len > 1) {
               /* Take from list two btrees with least weight
                * and build an interior node pointing to them.
                * This forms a new tree.
                */
               lch = list[0];  /* This one will be left child */

               /* delete top (least) tree from the list of trees */
               list[0] = list[--len];
               adjust(list, 0, len - 1);

               /* Take new top (least) tree. Reuse list slot later */
               rch = list[0];  /* This one will be right child */

               /* Form new tree from the two least trees using
                * a free node as root. Put the new tree in the list.
                */
               frnp = &node[freenode]; /* address of next free node */
               list[0] = freenode++;   /* put at top for now */
               frnp->lchild = lch;
               frnp->rchild = rch;
               frnp->weight = node[lch].weight + node[rch].weight;
               frnp->tdepth = 1 + maxchar(node[lch].tdepth, node[rch].tdepth);
               /* reheap list  to get least tree at top*/
               adjust(list, 0, len - 1);
       }
       dctreehd = list[0];     /*head of final tree */
}
/* */
char
maxchar(a, b)
char a, b;
{
       return a > b ? a : b;
}
/* Initialize all nodes to single element binary trees
* with zero weight and depth.
*/

init_tree()
{
       int i;

       for(i = 0; i < NUMNODES; ++i) {
               node[i].weight = 0;
               node[i].tdepth = 0;
               node[i].lchild = NOCHILD;
               node[i].rchild = NOCHILD;
       }
}

init_enc()
{
       int i;

       /* Initialize encoding table */
       for(i = 0; i < NUMVALS; ++i) {
               codelen[i] = 0;
       }
}
/* */
/* Recursive routine to walk the indicated subtree and level
* and maintain the current path code in bstree. When a leaf
* is found the entire code string and length are put into
* the encoding table entry for the leaf's data value.
*
* Returns ERROR if codes are too long.
*/

int             /* returns ERROR or NULL */
buildenc(level, root)
int level;/* level of tree being examined, from zero */
int root; /* root of subtree is also data value if leaf */
{
       int l, r;

       l = node[root].lchild;
       r = node[root].rchild;
#ifdef DEBUG
       if (debug) printf("level=%d,root=%d,l=%d,r=%d,tcode=%04x\n",level,root,l,r,tcode);
#endif
       if( l == NOCHILD && r == NOCHILD) {
               /* Leaf. Previous path determines bit string
                * code of length level (bits 0 to level - 1).
                * Ensures unused code bits are zero.
                */
               codelen[root] = level;
               code[root] = tcode & ((unsigned int)(~0) >> ((sizeof(int) * 8) - level));
#ifdef DEBUG
               if (debug) printf("  codelen[%d]=%d,code[%d]=%02x\n",root,codelen[root],root,code[root]);
#endif
               return (level > 16) ? ERROR : NULL;
       } else {
               if( l != NOCHILD) {
                       /* Clear path bit and continue deeper */
                       tcode &= ~(1 << level);
                       /* NOTE RECURSION */
                       if(buildenc(level + 1, l) == ERROR)
                               return ERROR;
               }
               if(r != NOCHILD) {
                       /* Set path bit and continue deeper */
                       tcode |= 1 << level;
                       /* NOTE RECURSION */
                       if(buildenc(level + 1, r) == ERROR)
                               return ERROR;
               }
       }
       return NULL;    /* if we got here we're ok so far */
}
/* */
/* Write out the header of the compressed file */

wrt_head(ob, infile)
FILE *ob;
char *infile;   /* input file name (w/ or w/o drive) */
{
       int i, k, l, r;
       int numnodes;           /* nbr of nodes in simplified tree */

       putwe(RECOGNIZE, ob);   /* identifies as compressed */
       putwe(crc, ob);         /* unsigned sum of original data */

       /* Record the original file name w/o drive */
       if(*(infile + 1) == ':')
               infile += 2;    /* skip drive */

       do {
               putce(*infile, ob);
       } while(*(infile++) != '\0');


       /* Write out a simplified decoding tree. Only the interior
        * nodes are written. When a child is a leaf index
        * (representing a data value) it is recoded as
        * -(index + 1) to distinguish it from interior indexes
        * which are recoded as positive indexes in the new tree.
        * Note that this tree will be empty for an empty file.
        */

       numnodes = dctreehd < NUMVALS ? 0 : dctreehd - (NUMVALS -1);
       putwe(numnodes, ob);

       for(k = 0, i = dctreehd; k < numnodes; ++k, --i) {
               l = node[i].lchild;
               r = node[i].rchild;
               l = l < NUMVALS ? -(l + 1) : dctreehd - l;
               r = r < NUMVALS ? -(r + 1) : dctreehd - r;
               putwe(l, ob);   /* left child */
               putwe(r, ob);   /* right child */
       }
}
/* */
/* Get an encoded byte or EOF. Reads from specified stream AS NEEDED.
*
* There are two unsynchronized bit-byte relationships here.
* The input stream bytes are converted to bit strings of
* various lengths via the static variables named c...
* These bit strings are concatenated without padding to
* become the stream of encoded result bytes, which this
* function returns one at a time. The EOF (end of file) is
* converted to SPEOF for convenience and encoded like any
* other input value. True EOF is returned after that.
*
* The original gethuff() called a seperate function,
* getbit(), but that more readable version was too slow.
*/

int                             /*  Returns byte values except for EOF */
gethuff(ib)
FILE *ib;
{
       unsigned int rbyte;     /* Result byte value */
       char need, take;        /* numbers of bits */

       rbyte = 0;
       need = 8;               /* build one byte per call */

       /* Loop to build a byte of encoded data
        * Initialization forces read the first time
        */

loop:
       if(cbitsrem >= need) {
               /* Current code fullfills our needs */
               if(need == 0)
                       return rbyte & 0xff;
               /* Take what we need */
               rbyte |= ccode << (8 - need);
               /* And leave the rest */
               ccode >>= need;
               cbitsrem -= need;
               return rbyte & 0xff;
       }

       /* We need more than current code */
       if(cbitsrem > 0) {
               /* Take what there is */
               rbyte |= ccode << (8 - need);
               need -= cbitsrem;
       }
       /* No more bits in current code string */
       if(curin == SPEOF) {
               /* The end of file token has been encoded. If
                * result byte has data return it and do EOF next time
                */
               cbitsrem = 0;

               return (need == 8) ? EOF : rbyte & 0xff;
       }

       /* Get an input byte */
       if((curin = getcnr(ib)) == EOF)
               curin = SPEOF;  /* convenient for encoding */

       /* Get the new byte's code */
       ccode = code[curin];
       cbitsrem = codelen[curin];

       goto loop;
}