Introduction
Introduction Statistics Contact Development Disclaimer Help
diffreg.c - 9base - revived minimalist port of Plan 9 userland to Unix
git clone git://git.suckless.org/9base
Log
Files
Refs
README
LICENSE
---
diffreg.c (8845B)
---
1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include "diff.h"
5
6 /* diff - differential file comparison
7 *
8 * Uses an algorithm due to Harold Stone, which finds
9 * a pair of longest identical subsequences in the two
10 * files.
11 *
12 * The major goal is to generate the match vector J.
13 * J[i] is the index of the line in file1 corresponding
14 * to line i file0. J[i] = 0 if there is no
15 * such line in file1.
16 *
17 * Lines are hashed so as to work in core. All potential
18 * matches are located by sorting the lines of each file
19 * on the hash (called value). In particular, this
20 * collects the equivalence classes in file1 together.
21 * Subroutine equiv replaces the value of each line in
22 * file0 by the index of the first element of its
23 * matching equivalence in (the reordered) file1.
24 * To save space equiv squeezes file1 into a single
25 * array member in which the equivalence classes
26 * are simply concatenated, except that their first
27 * members are flagged by changing sign.
28 *
29 * Next the indices that point into member are unsorted into
30 * array class according to the original order of file0.
31 *
32 * The cleverness lies in routine stone. This marches
33 * through the lines of file0, developing a vector klist
34 * of "k-candidates". At step i a k-candidate is a matched
35 * pair of lines x,y (x in file0 y in file1) such that
36 * there is a common subsequence of lenght k
37 * between the first i lines of file0 and the first y
38 * lines of file1, but there is no such subsequence for
39 * any smaller y. x is the earliest possible mate to y
40 * that occurs in such a subsequence.
41 *
42 * Whenever any of the members of the equivalence class of
43 * lines in file1 matable to a line in file0 has serial number
44 * less than the y of some k-candidate, that k-candidate
45 * with the smallest such y is replaced. The new
46 * k-candidate is chained (via pred) to the current
47 * k-1 candidate so that the actual subsequence can
48 * be recovered. When a member has serial number greater
49 * that the y of all k-candidates, the klist is extended.
50 * At the end, the longest subsequence is pulled out
51 * and placed in the array J by unravel.
52 *
53 * With J in hand, the matches there recorded are
54 * check'ed against reality to assure that no spurious
55 * matches have crept in due to hashing. If they have,
56 * they are broken, and "jackpot " is recorded--a harmless
57 * matter except that a true match for a spuriously
58 * mated line may now be unnecessarily reported as a change.
59 *
60 * Much of the complexity of the program comes simply
61 * from trying to minimize core utilization and
62 * maximize the range of doable problems by dynamically
63 * allocating what is needed and reusing what is not.
64 * The core requirements for problems larger than somewhat
65 * are (in words) 2*length(file0) + length(file1) +
66 * 3*(number of k-candidates installed), typically about
67 * 6n words for files of length n.
68 */
69 /* TIDY THIS UP */
70 struct cand {
71 int x;
72 int y;
73 int pred;
74 } cand;
75 struct line {
76 int serial;
77 int value;
78 } *file[2], line;
79 int len[2];
80 int binary;
81 struct line *sfile[2]; /*shortened by pruning common prefix and s…
82 int slen[2];
83 int pref, suff; /*length of prefix and suffix*/
84 int *class; /*will be overlaid on file[0]*/
85 int *member; /*will be overlaid on file[1]*/
86 int *klist; /*will be overlaid on file[0] after class*/
87 struct cand *clist; /* merely a free storage pot for candidates */
88 int clen;
89 int *J; /*will be overlaid on class*/
90 long *ixold; /*will be overlaid on klist*/
91 long *ixnew; /*will be overlaid on file[1]*/
92 /* END OF SOME TIDYING */
93
94 static void
95 sort(struct line *a, int n) /*shellsort CACM #201*/
96 {
97 int m;
98 struct line *ai, *aim, *j, *k;
99 struct line w;
100 int i;
101
102 m = 0;
103 for (i = 1; i <= n; i *= 2)
104 m = 2*i - 1;
105 for (m /= 2; m != 0; m /= 2) {
106 k = a+(n-m);
107 for (j = a+1; j <= k; j++) {
108 ai = j;
109 aim = ai+m;
110 do {
111 if (aim->value > ai->value ||
112 aim->value == ai->value &&
113 aim->serial > ai->serial)
114 break;
115 w = *ai;
116 *ai = *aim;
117 *aim = w;
118
119 aim = ai;
120 ai -= m;
121 } while (ai > a && aim >= ai);
122 }
123 }
124 }
125
126 static void
127 unsort(struct line *f, int l, int *b)
128 {
129 int *a;
130 int i;
131
132 a = MALLOC(int, (l+1));
133 for(i=1;i<=l;i++)
134 a[f[i].serial] = f[i].value;
135 for(i=1;i<=l;i++)
136 b[i] = a[i];
137 FREE(a);
138 }
139
140 static void
141 prune(void)
142 {
143 int i,j;
144
145 for(pref=0;pref<len[0]&&pref<len[1]&&
146 file[0][pref+1].value==file[1][pref+1].value;
147 pref++ ) ;
148 for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&&
149 file[0][len[0]-suff].value==file[1][len[1]-suff].value;
150 suff++) ;
151 for(j=0;j<2;j++) {
152 sfile[j] = file[j]+pref;
153 slen[j] = len[j]-pref-suff;
154 for(i=0;i<=slen[j];i++)
155 sfile[j][i].serial = i;
156 }
157 }
158
159 static void
160 equiv(struct line *a, int n, struct line *b, int m, int *c)
161 {
162 int i, j;
163
164 i = j = 1;
165 while(i<=n && j<=m) {
166 if(a[i].value < b[j].value)
167 a[i++].value = 0;
168 else if(a[i].value == b[j].value)
169 a[i++].value = j;
170 else
171 j++;
172 }
173 while(i <= n)
174 a[i++].value = 0;
175 b[m+1].value = 0;
176 j = 0;
177 while(++j <= m) {
178 c[j] = -b[j].serial;
179 while(b[j+1].value == b[j].value) {
180 j++;
181 c[j] = b[j].serial;
182 }
183 }
184 c[j] = -1;
185 }
186
187 static int
188 newcand(int x, int y, int pred)
189 {
190 struct cand *q;
191
192 clist = REALLOC(clist, struct cand, (clen+1));
193 q = clist + clen;
194 q->x = x;
195 q->y = y;
196 q->pred = pred;
197 return clen++;
198 }
199
200 static int
201 search(int *c, int k, int y)
202 {
203 int i, j, l;
204 int t;
205
206 if(clist[c[k]].y < y) /*quick look for typical case*/
207 return k+1;
208 i = 0;
209 j = k+1;
210 while((l=(i+j)/2) > i) {
211 t = clist[c[l]].y;
212 if(t > y)
213 j = l;
214 else if(t < y)
215 i = l;
216 else
217 return l;
218 }
219 return l+1;
220 }
221
222 static int
223 stone(int *a, int n, int *b, int *c)
224 {
225 int i, k,y;
226 int j, l;
227 int oldc, tc;
228 int oldl;
229
230 k = 0;
231 c[0] = newcand(0,0,0);
232 for(i=1; i<=n; i++) {
233 j = a[i];
234 if(j==0)
235 continue;
236 y = -b[j];
237 oldl = 0;
238 oldc = c[0];
239 do {
240 if(y <= clist[oldc].y)
241 continue;
242 l = search(c, k, y);
243 if(l!=oldl+1)
244 oldc = c[l-1];
245 if(l<=k) {
246 if(clist[c[l]].y <= y)
247 continue;
248 tc = c[l];
249 c[l] = newcand(i,y,oldc);
250 oldc = tc;
251 oldl = l;
252 } else {
253 c[l] = newcand(i,y,oldc);
254 k++;
255 break;
256 }
257 } while((y=b[++j]) > 0);
258 }
259 return k;
260 }
261
262 static void
263 unravel(int p)
264 {
265 int i;
266 struct cand *q;
267
268 for(i=0; i<=len[0]; i++) {
269 if (i <= pref)
270 J[i] = i;
271 else if (i > len[0]-suff)
272 J[i] = i+len[1]-len[0];
273 else
274 J[i] = 0;
275 }
276 for(q=clist+p;q->y!=0;q=clist+q->pred)
277 J[q->x+pref] = q->y+pref;
278 }
279
280 static void
281 output(void)
282 {
283 int m, i0, i1, j0, j1;
284
285 m = len[0];
286 J[0] = 0;
287 J[m+1] = len[1]+1;
288 if (mode != 'e') {
289 for (i0 = 1; i0 <= m; i0 = i1+1) {
290 while (i0 <= m && J[i0] == J[i0-1]+1)
291 i0++;
292 j0 = J[i0-1]+1;
293 i1 = i0-1;
294 while (i1 < m && J[i1+1] == 0)
295 i1++;
296 j1 = J[i1+1]-1;
297 J[i1] = j1;
298 change(i0, i1, j0, j1);
299 }
300 }
301 else {
302 for (i0 = m; i0 >= 1; i0 = i1-1) {
303 while (i0 >= 1 && J[i0] == J[i0+1]-1 && J[i0])
304 i0--;
305 j0 = J[i0+1]-1;
306 i1 = i0+1;
307 while (i1 > 1 && J[i1-1] == 0)
308 i1--;
309 j1 = J[i1-1]+1;
310 J[i1] = j1;
311 change(i1 , i0, j1, j0);
312 }
313 }
314 if (m == 0)
315 change(1, 0, 1, len[1]);
316 flushchanges();
317 }
318
319 #define BUF 4096
320 static int
321 cmp(Biobuf* b1, Biobuf* b2)
322 {
323 int n;
324 uchar buf1[BUF], buf2[BUF];
325 int f1, f2;
326 vlong nc = 1;
327 uchar *b1s, *b1e, *b2s, *b2e;
328
329 f1 = Bfildes(b1);
330 f2 = Bfildes(b2);
331 seek(f1, 0, 0);
332 seek(f2, 0, 0);
333 b1s = b1e = buf1;
334 b2s = b2e = buf2;
335 for(;;){
336 if(b1s >= b1e){
337 if(b1s >= &buf1[BUF])
338 b1s = buf1;
339 n = read(f1, b1s, &buf1[BUF] - b1s);
340 b1e = b1s + n;
341 }
342 if(b2s >= b2e){
343 if(b2s >= &buf2[BUF])
344 b2s = buf2;
345 n = read(f2, b2s, &buf2[BUF] - b2s);
346 b2e = b2s + n;
347 }
348 n = b2e - b2s;
349 if(n > b1e - b1s)
350 n = b1e - b1s;
351 if(n <= 0)
352 break;
353 if(memcmp((void *)b1s, (void *)b2s, n) != 0){
354 return 1;
355 }
356 nc += n;
357 b1s += n;
358 b2s += n;
359 }
360 if(b1e - b1s == b2e - b2s)
361 return 0;
362 return 1;
363 }
364
365 void
366 diffreg(char *f, char *t)
367 {
368 Biobuf *b0, *b1;
369 int k;
370
371 binary = 0;
372 b0 = prepare(0, f);
373 if (!b0)
374 return;
375 b1 = prepare(1, t);
376 if (!b1) {
377 FREE(file[0]);
378 Bterm(b0);
379 return;
380 }
381 if (binary){
382 /* could use b0 and b1 but this is simpler. */
383 if (cmp(b0, b1))
384 print("binary files %s %s differ\n", f, t);
385 Bterm(b0);
386 Bterm(b1);
387 return;
388 }
389 clen = 0;
390 prune();
391 sort(sfile[0], slen[0]);
392 sort(sfile[1], slen[1]);
393
394 member = (int *)file[1];
395 equiv(sfile[0], slen[0], sfile[1], slen[1], member);
396 member = REALLOC(member, int, slen[1]+2);
397
398 class = (int *)file[0];
399 unsort(sfile[0], slen[0], class);
400 class = REALLOC(class, int, slen[0]+2);
401
402 klist = MALLOC(int, slen[0]+2);
403 clist = MALLOC(struct cand, 1);
404 k = stone(class, slen[0], member, klist);
405 FREE(member);
406 FREE(class);
407
408 J = MALLOC(int, len[0]+2);
409 unravel(klist[k]);
410 FREE(clist);
411 FREE(klist);
412
413 ixold = MALLOC(long, len[0]+2);
414 ixnew = MALLOC(long, len[1]+2);
415 Bseek(b0, 0, 0); Bseek(b1, 0, 0);
416 check(b0, b1);
417 output();
418 FREE(J); FREE(ixold); FREE(ixnew);
419 Bterm(b0); Bterm(b1); /* ++++ */
420 }
You are viewing proxied material from suckless.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.