/* diff - differential file comparison
*
* Uses an algorithm due to Harold Stone, which finds
* a pair of longest identical subsequences in the two
* files.
*
* The major goal is to generate the match vector J.
* J[i] is the index of the line in file1 corresponding
* to line i of file0. J[i] = 0 if there is no
* such line in file1.
*
* Lines are hashed so as to work in core. All potential
* matches are located by sorting the lines of each file
* on the hash (called value). In particular, this
* collects the equivalence classes in file1 together.
* Subroutine equiv replaces the value of each line in
* file0 by the index of the first element of its
* matching equivalence in (the reordered) file1.
* To save space equiv squeezes file1 into a single
* array member in which the equivalence classes
* are simply concatenated, except that their first
* members are flagged by changing sign.
*
* Next the indices that point into member are unsorted into
* array class according to the original order of file0.
*
* The cleverness lies in routine stone. This marches
* through the lines of file0, developing a vector klist
* of "k-candidates". At step i a k-candidate is a matched
* pair of lines x,y (x in file0 y in file1) such that
* there is a common subsequence of length k
* between the first i lines of file0 and the first y
* lines of file1, but there is no such subsequence for
* any smaller y. x is the earliest possible mate to y
* that occurs in such a subsequence.
*
* Whenever any of the members of the equivalence class of
* lines in file1 matable to a line in file0 has serial number
* less than the y of some k-candidate, that k-candidate
* with the smallest such y is replaced. The new
* k-candidate is chained (via pred) to the current
* k-1 candidate so that the actual subsequence can
* be recovered. When a member has serial number greater
* than the y of all k-candidates, the klist is extended.
* At the end, the longest subsequence is pulled out
* and placed in the array J by unravel.
*
* With J in hand, the matches there recorded are
* check'ed against reality to assure that no spurious
* matches have crept in due to hashing. If they have,
* they are broken, and "jackpot " is recorded--a harmless
* matter except that a true match for a spuriously
* mated line may now be unnecessarily reported as a change.
*
* Much of the complexity of the program comes simply
* from trying to minimize core utilization and
* maximize the range of doable problems by dynamically
* allocating what is needed and reusing what is not.
* The core requirements for problems larger than somewhat
* are (in words) 2*length(file0) + length(file1) +
* 3*(number of k-candidates installed), typically about
* 6n words for files of length n.
*/
/* TIDY THIS UP */
/*
 * A k-candidate: a matched pair of lines, x in file0 and y in file1,
 * chained through pred to the k-1 candidate.
 */
struct cand {
	int x;
	int y;
	int pred;
};
struct cand cand;

/* one line: its serial number and its hash (called value) */
struct line {
	int serial;
	int value;
};
struct line *file[2];
struct line line;

int len[2];
int binary;

/* shortened by pruning common prefix and suffix */
struct line *sfile[2];
int slen[2];

/* length of prefix and suffix */
int pref;
int suff;

int *class;		/* will be overlaid on file[0] */
int *member;		/* will be overlaid on file[1] */
int *klist;		/* will be overlaid on file[0] after class */
struct cand *clist;	/* merely a free storage pot for candidates */
int clen;
int *J;			/* will be overlaid on class */
long *ixold;		/* will be overlaid on klist */
long *ixnew;		/* will be overlaid on file[1] */
/* END OF SOME TIDYING */
/*
 * Shellsort (CACM algorithm #201) of the n elements a[1]..a[n];
 * a[0] is never read or written.  On return the elements are in
 * ascending order of value, ties ordered by ascending serial.
 * Nothing is done when n < 2.
 *
 * Elements are addressed by index, so no pointer below the start
 * of the array is ever formed, and the first gap is derived without
 * computing a power of two larger than n, so the set-up cannot
 * overflow an int however large n is.
 */
static void
sort(struct line *a, int n)
{
	int gap, p, start, lo, hi;
	struct line w;

	/* first gap is p-1, where p is the largest power of two <= n */
	gap = 0;
	if (n > 0) {
		p = 1;
		while (p <= n/2)
			p *= 2;
		gap = p - 1;
	}
	/* gaps run 2^k-1, ..., 7, 3, 1 */
	for (; gap != 0; gap /= 2) {
		for (start = 1; start <= n-gap; start++) {
			lo = start;
			hi = lo + gap;
			/* sift the new element down its chain of gap-spaced slots */
			for (;;) {
				if (a[hi].value > a[lo].value ||
				    (a[hi].value == a[lo].value &&
				     a[hi].serial > a[lo].serial))
					break;	/* pair already in order */
				w = a[lo];
				a[lo] = a[hi];
				a[hi] = w;
				hi = lo;
				if (lo <= gap)
					break;	/* no slot a gap below lo */
				lo -= gap;
			}
		}
	}
}
static void
unsort(struct line *f, int l, int *b)
{
int *a;
int i;