| diffreg.c - 9base - revived minimalist port of Plan 9 userland to Unix | |
| git clone git://git.suckless.org/9base | |
| Log | |
| Files | |
| Refs | |
| README | |
| LICENSE | |
| --- | |
| diffreg.c (8845B) | |
| --- | |
| 1 #include <u.h> | |
| 2 #include <libc.h> | |
| 3 #include <bio.h> | |
| 4 #include "diff.h" | |
| 5 | |
| 6 /* diff - differential file comparison | |
| 7 * | |
| 8 * Uses an algorithm due to Harold Stone, which finds | |
| 9 * a pair of longest identical subsequences in the two | |
| 10 * files. | |
| 11 * | |
| 12 * The major goal is to generate the match vector J. | |
| 13 * J[i] is the index of the line in file1 corresponding | |
| 14 * to line i file0. J[i] = 0 if there is no | |
| 15 * such line in file1. | |
| 16 * | |
| 17 * Lines are hashed so as to work in core. All potential | |
| 18 * matches are located by sorting the lines of each file | |
| 19 * on the hash (called value). In particular, this | |
| 20 * collects the equivalence classes in file1 together. | |
| 21 * Subroutine equiv replaces the value of each line in | |
| 22 * file0 by the index of the first element of its | |
| 23 * matching equivalence in (the reordered) file1. | |
| 24 * To save space equiv squeezes file1 into a single | |
| 25 * array member in which the equivalence classes | |
| 26 * are simply concatenated, except that their first | |
| 27 * members are flagged by changing sign. | |
| 28 * | |
| 29 * Next the indices that point into member are unsorted into | |
| 30 * array class according to the original order of file0. | |
| 31 * | |
| 32 * The cleverness lies in routine stone. This marches | |
| 33 * through the lines of file0, developing a vector klist | |
| 34 * of "k-candidates". At step i a k-candidate is a matched | |
| 35 * pair of lines x,y (x in file0 y in file1) such that | |
| 36 * there is a common subsequence of lenght k | |
| 37 * between the first i lines of file0 and the first y | |
| 38 * lines of file1, but there is no such subsequence for | |
| 39 * any smaller y. x is the earliest possible mate to y | |
| 40 * that occurs in such a subsequence. | |
| 41 * | |
| 42 * Whenever any of the members of the equivalence class of | |
| 43 * lines in file1 matable to a line in file0 has serial number | |
| 44 * less than the y of some k-candidate, that k-candidate | |
| 45 * with the smallest such y is replaced. The new | |
| 46 * k-candidate is chained (via pred) to the current | |
| 47 * k-1 candidate so that the actual subsequence can | |
| 48 * be recovered. When a member has serial number greater | |
| 49 * that the y of all k-candidates, the klist is extended. | |
| 50 * At the end, the longest subsequence is pulled out | |
| 51 * and placed in the array J by unravel. | |
| 52 * | |
| 53 * With J in hand, the matches there recorded are | |
| 54 * check'ed against reality to assure that no spurious | |
| 55 * matches have crept in due to hashing. If they have, | |
| 56 * they are broken, and "jackpot " is recorded--a harmless | |
| 57 * matter except that a true match for a spuriously | |
| 58 * mated line may now be unnecessarily reported as a change. | |
| 59 * | |
| 60 * Much of the complexity of the program comes simply | |
| 61 * from trying to minimize core utilization and | |
| 62 * maximize the range of doable problems by dynamically | |
| 63 * allocating what is needed and reusing what is not. | |
| 64 * The core requirements for problems larger than somewhat | |
| 65 * are (in words) 2*length(file0) + length(file1) + | |
| 66 * 3*(number of k-candidates installed), typically about | |
| 67 * 6n words for files of length n. | |
| 68 */ | |
| 69 /* TIDY THIS UP */ | |
| 70 struct cand { | |
| 71 int x; | |
| 72 int y; | |
| 73 int pred; | |
| 74 } cand; | |
| 75 struct line { | |
| 76 int serial; | |
| 77 int value; | |
| 78 } *file[2], line; | |
| 79 int len[2]; | |
| 80 int binary; | |
| 81 struct line *sfile[2]; /*shortened by pruning common prefix and s… | |
| 82 int slen[2]; | |
| 83 int pref, suff; /*length of prefix and suffix*/ | |
| 84 int *class; /*will be overlaid on file[0]*/ | |
| 85 int *member; /*will be overlaid on file[1]*/ | |
| 86 int *klist; /*will be overlaid on file[0] after class*/ | |
| 87 struct cand *clist; /* merely a free storage pot for candidates */ | |
| 88 int clen; | |
| 89 int *J; /*will be overlaid on class*/ | |
| 90 long *ixold; /*will be overlaid on klist*/ | |
| 91 long *ixnew; /*will be overlaid on file[1]*/ | |
| 92 /* END OF SOME TIDYING */ | |
| 93 | |
| 94 static void | |
| 95 sort(struct line *a, int n) /*shellsort CACM #201*/ | |
| 96 { | |
| 97 int m; | |
| 98 struct line *ai, *aim, *j, *k; | |
| 99 struct line w; | |
| 100 int i; | |
| 101 | |
| 102 m = 0; | |
| 103 for (i = 1; i <= n; i *= 2) | |
| 104 m = 2*i - 1; | |
| 105 for (m /= 2; m != 0; m /= 2) { | |
| 106 k = a+(n-m); | |
| 107 for (j = a+1; j <= k; j++) { | |
| 108 ai = j; | |
| 109 aim = ai+m; | |
| 110 do { | |
| 111 if (aim->value > ai->value || | |
| 112 aim->value == ai->value && | |
| 113 aim->serial > ai->serial) | |
| 114 break; | |
| 115 w = *ai; | |
| 116 *ai = *aim; | |
| 117 *aim = w; | |
| 118 | |
| 119 aim = ai; | |
| 120 ai -= m; | |
| 121 } while (ai > a && aim >= ai); | |
| 122 } | |
| 123 } | |
| 124 } | |
| 125 | |
| 126 static void | |
| 127 unsort(struct line *f, int l, int *b) | |
| 128 { | |
| 129 int *a; | |
| 130 int i; | |
| 131 | |
| 132 a = MALLOC(int, (l+1)); | |
| 133 for(i=1;i<=l;i++) | |
| 134 a[f[i].serial] = f[i].value; | |
| 135 for(i=1;i<=l;i++) | |
| 136 b[i] = a[i]; | |
| 137 FREE(a); | |
| 138 } | |
| 139 | |
| 140 static void | |
| 141 prune(void) | |
| 142 { | |
| 143 int i,j; | |
| 144 | |
| 145 for(pref=0;pref<len[0]&&pref<len[1]&& | |
| 146 file[0][pref+1].value==file[1][pref+1].value; | |
| 147 pref++ ) ; | |
| 148 for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&& | |
| 149 file[0][len[0]-suff].value==file[1][len[1]-suff].value; | |
| 150 suff++) ; | |
| 151 for(j=0;j<2;j++) { | |
| 152 sfile[j] = file[j]+pref; | |
| 153 slen[j] = len[j]-pref-suff; | |
| 154 for(i=0;i<=slen[j];i++) | |
| 155 sfile[j][i].serial = i; | |
| 156 } | |
| 157 } | |
| 158 | |
| 159 static void | |
| 160 equiv(struct line *a, int n, struct line *b, int m, int *c) | |
| 161 { | |
| 162 int i, j; | |
| 163 | |
| 164 i = j = 1; | |
| 165 while(i<=n && j<=m) { | |
| 166 if(a[i].value < b[j].value) | |
| 167 a[i++].value = 0; | |
| 168 else if(a[i].value == b[j].value) | |
| 169 a[i++].value = j; | |
| 170 else | |
| 171 j++; | |
| 172 } | |
| 173 while(i <= n) | |
| 174 a[i++].value = 0; | |
| 175 b[m+1].value = 0; | |
| 176 j = 0; | |
| 177 while(++j <= m) { | |
| 178 c[j] = -b[j].serial; | |
| 179 while(b[j+1].value == b[j].value) { | |
| 180 j++; | |
| 181 c[j] = b[j].serial; | |
| 182 } | |
| 183 } | |
| 184 c[j] = -1; | |
| 185 } | |
| 186 | |
| 187 static int | |
| 188 newcand(int x, int y, int pred) | |
| 189 { | |
| 190 struct cand *q; | |
| 191 | |
| 192 clist = REALLOC(clist, struct cand, (clen+1)); | |
| 193 q = clist + clen; | |
| 194 q->x = x; | |
| 195 q->y = y; | |
| 196 q->pred = pred; | |
| 197 return clen++; | |
| 198 } | |
| 199 | |
| 200 static int | |
| 201 search(int *c, int k, int y) | |
| 202 { | |
| 203 int i, j, l; | |
| 204 int t; | |
| 205 | |
| 206 if(clist[c[k]].y < y) /*quick look for typical case*/ | |
| 207 return k+1; | |
| 208 i = 0; | |
| 209 j = k+1; | |
| 210 while((l=(i+j)/2) > i) { | |
| 211 t = clist[c[l]].y; | |
| 212 if(t > y) | |
| 213 j = l; | |
| 214 else if(t < y) | |
| 215 i = l; | |
| 216 else | |
| 217 return l; | |
| 218 } | |
| 219 return l+1; | |
| 220 } | |
| 221 | |
| 222 static int | |
| 223 stone(int *a, int n, int *b, int *c) | |
| 224 { | |
| 225 int i, k,y; | |
| 226 int j, l; | |
| 227 int oldc, tc; | |
| 228 int oldl; | |
| 229 | |
| 230 k = 0; | |
| 231 c[0] = newcand(0,0,0); | |
| 232 for(i=1; i<=n; i++) { | |
| 233 j = a[i]; | |
| 234 if(j==0) | |
| 235 continue; | |
| 236 y = -b[j]; | |
| 237 oldl = 0; | |
| 238 oldc = c[0]; | |
| 239 do { | |
| 240 if(y <= clist[oldc].y) | |
| 241 continue; | |
| 242 l = search(c, k, y); | |
| 243 if(l!=oldl+1) | |
| 244 oldc = c[l-1]; | |
| 245 if(l<=k) { | |
| 246 if(clist[c[l]].y <= y) | |
| 247 continue; | |
| 248 tc = c[l]; | |
| 249 c[l] = newcand(i,y,oldc); | |
| 250 oldc = tc; | |
| 251 oldl = l; | |
| 252 } else { | |
| 253 c[l] = newcand(i,y,oldc); | |
| 254 k++; | |
| 255 break; | |
| 256 } | |
| 257 } while((y=b[++j]) > 0); | |
| 258 } | |
| 259 return k; | |
| 260 } | |
| 261 | |
| 262 static void | |
| 263 unravel(int p) | |
| 264 { | |
| 265 int i; | |
| 266 struct cand *q; | |
| 267 | |
| 268 for(i=0; i<=len[0]; i++) { | |
| 269 if (i <= pref) | |
| 270 J[i] = i; | |
| 271 else if (i > len[0]-suff) | |
| 272 J[i] = i+len[1]-len[0]; | |
| 273 else | |
| 274 J[i] = 0; | |
| 275 } | |
| 276 for(q=clist+p;q->y!=0;q=clist+q->pred) | |
| 277 J[q->x+pref] = q->y+pref; | |
| 278 } | |
| 279 | |
| 280 static void | |
| 281 output(void) | |
| 282 { | |
| 283 int m, i0, i1, j0, j1; | |
| 284 | |
| 285 m = len[0]; | |
| 286 J[0] = 0; | |
| 287 J[m+1] = len[1]+1; | |
| 288 if (mode != 'e') { | |
| 289 for (i0 = 1; i0 <= m; i0 = i1+1) { | |
| 290 while (i0 <= m && J[i0] == J[i0-1]+1) | |
| 291 i0++; | |
| 292 j0 = J[i0-1]+1; | |
| 293 i1 = i0-1; | |
| 294 while (i1 < m && J[i1+1] == 0) | |
| 295 i1++; | |
| 296 j1 = J[i1+1]-1; | |
| 297 J[i1] = j1; | |
| 298 change(i0, i1, j0, j1); | |
| 299 } | |
| 300 } | |
| 301 else { | |
| 302 for (i0 = m; i0 >= 1; i0 = i1-1) { | |
| 303 while (i0 >= 1 && J[i0] == J[i0+1]-1 && J[i0]) | |
| 304 i0--; | |
| 305 j0 = J[i0+1]-1; | |
| 306 i1 = i0+1; | |
| 307 while (i1 > 1 && J[i1-1] == 0) | |
| 308 i1--; | |
| 309 j1 = J[i1-1]+1; | |
| 310 J[i1] = j1; | |
| 311 change(i1 , i0, j1, j0); | |
| 312 } | |
| 313 } | |
| 314 if (m == 0) | |
| 315 change(1, 0, 1, len[1]); | |
| 316 flushchanges(); | |
| 317 } | |
| 318 | |
| 319 #define BUF 4096 | |
| 320 static int | |
| 321 cmp(Biobuf* b1, Biobuf* b2) | |
| 322 { | |
| 323 int n; | |
| 324 uchar buf1[BUF], buf2[BUF]; | |
| 325 int f1, f2; | |
| 326 vlong nc = 1; | |
| 327 uchar *b1s, *b1e, *b2s, *b2e; | |
| 328 | |
| 329 f1 = Bfildes(b1); | |
| 330 f2 = Bfildes(b2); | |
| 331 seek(f1, 0, 0); | |
| 332 seek(f2, 0, 0); | |
| 333 b1s = b1e = buf1; | |
| 334 b2s = b2e = buf2; | |
| 335 for(;;){ | |
| 336 if(b1s >= b1e){ | |
| 337 if(b1s >= &buf1[BUF]) | |
| 338 b1s = buf1; | |
| 339 n = read(f1, b1s, &buf1[BUF] - b1s); | |
| 340 b1e = b1s + n; | |
| 341 } | |
| 342 if(b2s >= b2e){ | |
| 343 if(b2s >= &buf2[BUF]) | |
| 344 b2s = buf2; | |
| 345 n = read(f2, b2s, &buf2[BUF] - b2s); | |
| 346 b2e = b2s + n; | |
| 347 } | |
| 348 n = b2e - b2s; | |
| 349 if(n > b1e - b1s) | |
| 350 n = b1e - b1s; | |
| 351 if(n <= 0) | |
| 352 break; | |
| 353 if(memcmp((void *)b1s, (void *)b2s, n) != 0){ | |
| 354 return 1; | |
| 355 } | |
| 356 nc += n; | |
| 357 b1s += n; | |
| 358 b2s += n; | |
| 359 } | |
| 360 if(b1e - b1s == b2e - b2s) | |
| 361 return 0; | |
| 362 return 1; | |
| 363 } | |
| 364 | |
| 365 void | |
| 366 diffreg(char *f, char *t) | |
| 367 { | |
| 368 Biobuf *b0, *b1; | |
| 369 int k; | |
| 370 | |
| 371 binary = 0; | |
| 372 b0 = prepare(0, f); | |
| 373 if (!b0) | |
| 374 return; | |
| 375 b1 = prepare(1, t); | |
| 376 if (!b1) { | |
| 377 FREE(file[0]); | |
| 378 Bterm(b0); | |
| 379 return; | |
| 380 } | |
| 381 if (binary){ | |
| 382 /* could use b0 and b1 but this is simpler. */ | |
| 383 if (cmp(b0, b1)) | |
| 384 print("binary files %s %s differ\n", f, t); | |
| 385 Bterm(b0); | |
| 386 Bterm(b1); | |
| 387 return; | |
| 388 } | |
| 389 clen = 0; | |
| 390 prune(); | |
| 391 sort(sfile[0], slen[0]); | |
| 392 sort(sfile[1], slen[1]); | |
| 393 | |
| 394 member = (int *)file[1]; | |
| 395 equiv(sfile[0], slen[0], sfile[1], slen[1], member); | |
| 396 member = REALLOC(member, int, slen[1]+2); | |
| 397 | |
| 398 class = (int *)file[0]; | |
| 399 unsort(sfile[0], slen[0], class); | |
| 400 class = REALLOC(class, int, slen[0]+2); | |
| 401 | |
| 402 klist = MALLOC(int, slen[0]+2); | |
| 403 clist = MALLOC(struct cand, 1); | |
| 404 k = stone(class, slen[0], member, klist); | |
| 405 FREE(member); | |
| 406 FREE(class); | |
| 407 | |
| 408 J = MALLOC(int, len[0]+2); | |
| 409 unravel(klist[k]); | |
| 410 FREE(clist); | |
| 411 FREE(klist); | |
| 412 | |
| 413 ixold = MALLOC(long, len[0]+2); | |
| 414 ixnew = MALLOC(long, len[1]+2); | |
| 415 Bseek(b0, 0, 0); Bseek(b1, 0, 0); | |
| 416 check(b0, b1); | |
| 417 output(); | |
| 418 FREE(J); FREE(ixold); FREE(ixnew); | |
| 419 Bterm(b0); Bterm(b1); /* ++++ */ | |
| 420 } |