Introduction
Introduction Statistics Contact Development Disclaimer Help
gopher-clustering2.c - brcon2024-hackathons - Bitreichcon 2024 Hackathons
git clone git://bitreich.org/brcon2024-hackathons git://enlrupgkhuxnvlhsf6lc3fz…
Log
Files
Refs
Tags
Submodules
---
gopher-clustering2.c (24998B)
---
1 #define _POSIX_C_SOURCE 200112L
2
3 #include <assert.h>
4 #include <stdarg.h>
5 #include <stdint.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9
10 #include <math.h>
11
12 #include <netdb.h>
13 #include <sys/socket.h>
14 #include <sys/types.h>
15 #include <unistd.h>
16
17 uint32_t
18 fnv1a(int n,...)
19 {
20 int i;
21 char *s;
22 va_list l;
23 uint32_t h;
24
25 h = 0x811c9dc5;
26
27 va_start(l, n);
28 for (i = 0; i < n; i++) {
29 for (s = va_arg(l, char*); *s; s++) {
30 h ^= *s;
31 h *= 0x01000193;
32 }
33 }
34 va_end(l);
35
36 return h;
37 }
38
39 uint32_t
40 xorshift(uint32_t *s)
41 {
42 *s ^= *s << 13;
43 *s ^= *s >> 17;
44 *s ^= *s << 5;
45 return *s;
46 }
47
48 /*
49 https://github.com/skeeto/hash-prospector/issues/19#issuecomment…
50 */
51 uint32_t
52 mix(uint32_t x)
53 {
54 x ^= x >> 16;
55 x *= 0x21f0aaad;
56 x ^= x >> 15;
57 x *= 0x735a2d97;
58 x ^= x >> 16;
59 return x;
60 }
61
62 struct gopherentry {
63 struct gopherentry *next;
64 char type;
65 char *fields[4];
66 size_t i, s;
67 };
68
69 char *
70 gopher_fieldend(char *s)
71 {
72 for (; *s; s++) {
73 if (*s == '\t')
74 break;
75 if (*s == '\r' && *(s+1) == '\n')
76 break;
77 if (*s == '\n')
78 break;
79 }
80 return s;
81 }
82
83 char *
84 gopher_nextentry(char *s)
85 {
86 char *c;
87
88 if (c = strchr(s, '\n'))
89 return c + 1;
90 return NULL;
91 }
92
93 void *
94 xmalloc(size_t s)
95 {
96 void *p;
97
98 if (!(p = malloc(s))) {
99 perror(NULL);
100 exit(1);
101 }
102 return p;
103 }
104
105 void *
106 xrealloc(void *p, size_t s)
107 {
108 if (!(p = realloc(p, s))) {
109 perror(NULL);
110 exit(1);
111 }
112 return p;
113 }
114
115 /* Allow a lot of ugly things... */
116 size_t
117 gopher_parsemenu(char *d, struct gopherentry **g)
118 {
119 ssize_t gl;
120 size_t fl, i, s;
121 char *c, *p, pt;
122 struct gopherentry *cg;
123
124 gl = 0;
125 cg = NULL;
126
127 s = 0;
128 pt = 0;
129 for (c = d; c && *c; c = gopher_nextentry(c)) {
130 if (*c == '.')
131 break;
132 if (*c == '\n')
133 continue;
134
135 gl++;
136 cg = xrealloc(cg, gl * sizeof(struct gopherentry));
137 memset(&cg[gl-1], 0, sizeof(struct gopherentry));
138
139 if (*c != 'i' && pt == 'i')
140 s++;
141 pt = *c;
142
143 cg[gl-1].type = *c;
144 cg[gl-1].i = gl-1;
145 cg[gl-1].s = s;
146 c++;
147
148 for (i = 0; i < 4; i++) {
149 p = gopher_fieldend(c);
150 fl = p - c;
151 cg[gl-1].fields[i] = xmalloc(fl + 1);
152 memcpy(cg[gl-1].fields[i], c, fl);
153 cg[gl-1].fields[i][fl] = '\0';
154 if (*p != '\t')
155 break;
156 c = p + 1;
157 }
158 }
159 s++;
160 *g = cg;
161 return gl;
162 }
163
164 ssize_t
165 gopher_removeentries(struct gopherentry *g, size_t l)
166 {
167 size_t i, j;
168
169 for (i = 0; i < l;) {
170 if (g[i].type == 'i' || g[i].type == '3' || !g[i].fields…
171 l--;
172 memmove(&g[i], &g[i+1], (l - i) * sizeof(*g));
173 continue;
174 }
175 for (j = 0; j < i; j++) {
176 if (!strcmp(g[i].fields[1], g[j].fields[1]) &&
177 !strcmp(g[i].fields[2], g[j].fields[2]) &&
178 !strcmp(g[i].fields[3], g[j].fields[3]))
179 break;
180 }
181 if (j < i) {
182 l--;
183 memmove(&g[i], &g[i+1], (l - i) * sizeof(*g));
184 continue;
185 }
186 i++;
187 }
188
189 return l;
190 }
191
192 ssize_t
193 gopher_request(char *host, char *port, char *selector, char *buffer, siz…
194 {
195 int fd;
196 struct addrinfo hints;
197 struct addrinfo *r, *rp;
198 ssize_t n, result, rn;
199
200 memset(&hints, 0, sizeof(hints));
201 hints.ai_family = AF_UNSPEC;
202 hints.ai_socktype = SOCK_STREAM;
203
204 if (getaddrinfo(host, port, &hints, &r) != 0)
205 return -1;
206
207 for (rp = r; rp; rp = rp->ai_next) {
208 if ((fd = socket(rp->ai_family, rp->ai_socktype, rp->ai_…
209 continue;
210 if (connect(fd, rp->ai_addr, rp->ai_addrlen) != -1)
211 break;
212 close(fd);
213 }
214 freeaddrinfo(r);
215 if (!rp)
216 return -1;
217
218 result = -1;
219 if (write(fd, selector, strlen(selector)) != strlen(selector))
220 goto cleanup;
221
222 if (write(fd, "\r\n", 2) != 2)
223 goto cleanup;
224
225 n = 0;
226 while (1) {
227 if (n + 512 > l - 1)
228 goto cleanup;
229 if ((rn = read(fd, &buffer[n], 512)) == -1)
230 goto cleanup;
231 else if (!rn)
232 break;
233 n += rn;
234 }
235
236 buffer[n] = '\0';
237 result = n;
238
239 cleanup:
240 close(fd);
241
242 return result;
243 }
244
245 size_t
246 triangularnumber(size_t n)
247 {
248 return (n * n + n) / 2;
249 }
250
251 size_t
252 distanceindex(size_t i, size_t j)
253 {
254 size_t t;
255
256 if (i < j) {
257 t = i;
258 i = j;
259 j = t;
260 }
261
262 return triangularnumber(i) + j;
263 }
264
265 struct point {
266 double dims[2];
267 };
268
269 #define SAMMON_MAPPING_MAX_ITER (1 << 13)
270 #define SAMMON_MAPPING_LR (0.01)
271 void
272 sammon(double *distances, size_t l, uint32_t prng, struct point *points)
273 {
274 size_t i, j, k, t, tmax;
275 double *gnarf;
276 double c, lr;
277 double e1, e2, sum, lasterror;
278 struct point *newpoints;
279
280 gnarf = calloc(triangularnumber(l), sizeof(*gnarf));
281 newpoints = calloc(l, sizeof(*newpoints));
282
283 for (i = 0; i < l; i++) {
284 points[i].dims[0] = (double)xorshift(&prng) / UINT32_MAX…
285 points[i].dims[1] = (double)xorshift(&prng) / UINT32_MAX…
286 }
287
288 c = 0;
289 for (i = 0; i < l; i++) {
290 gnarf[distanceindex(i, i)] = 0;
291 for (j = 0; j < i; j++) {
292 gnarf[distanceindex(i, j)] = sqrt(pow(points[i].…
293 c += distances[distanceindex(i, j)];
294 }
295 }
296
297 lasterror = 0;
298 for (t = 0, tmax = SAMMON_MAPPING_MAX_ITER; t < tmax; t++) {
299 lr = SAMMON_MAPPING_LR;
300 for (i = 0; i < l; i++) {
301 for (j = 0; j < 2; j++) {
302 e1 = -2. / c;
303 sum = 0;
304 for (k = 0; k < l; k++) {
305 if (i == k)
306 continue;
307 sum += (distances[distanceindex(…
308 }
309 e1 *= sum;
310
311 e2 = -2. / c;
312 sum = 0;
313 for (k = 0; k < l; k++) {
314 if (i == k)
315 continue;
316 sum += 1. / (distances[distancei…
317 }
318 e2 *= sum;
319
320 newpoints[i].dims[j] = points[i].dims[j]…
321 }
322 }
323
324 sum = 0;
325 for (i = 0; i < l; i++) {
326 for (j = 0; j < i; j++) {
327 gnarf[distanceindex(i, j)] = sqrt(pow(ne…
328 sum += pow(distances[distanceindex(i, j)…
329 }
330 }
331
332 fprintf(stderr, "sammon mapping error: %lf\n", sum / c);
333 if (t > 0 && fabs(sum / c - lasterror) < 0.000000000001)
334 break;
335 lasterror = sum / c;
336
337 memcpy(points, newpoints, l * sizeof(*points));
338 }
339
340 free(newpoints);
341 free(gnarf);
342 }
343
344 int
345 fastermsc(struct gopherentry *g, size_t l, size_t k, double *distances, …
346 {
347 size_t i, j, n, ck, grimmel;
348 ssize_t m1, m, m2;
349 size_t ti, th, huh;
350 double dsum, d, d2, gnarf, tdsum;
351 struct {
352 size_t medoid;
353 size_t n;
354 double dsum;
355 } *clusters;
356 struct {
357 size_t n1, n2, n3;
358 double d1, d2, d3;
359 } *cache;
360 double *dS, *dS2;
361 double cdS, cdS2;
362 size_t cm, cx;
363 ssize_t lx;
364 int flag, result;
365 uint32_t r;
366 double *C;
367
368 if (!l)
369 return 0;
370
371 if (l < k)
372 k = l;
373
374 clusters = calloc(k, sizeof(*clusters));
375 if (!clusters)
376 return -1;
377
378 result = -1;
379
380 /*
381 Seed the medoids randomly
382 */
383 for (i = n = 0; i < l; i++) {
384 if ((l - i) * (xorshift(&prng) / (double)UINT32_MAX) < k…
385 clusters[n++].medoid = i;
386 if (n == k)
387 break;
388 }
389
390 cache = calloc(l, sizeof(*cache));
391 if (!cache)
392 goto cleanup;
393
394 dS = calloc(k, sizeof(*dS));
395 if (!dS)
396 goto cleanup;
397
398 dS2 = calloc(k, sizeof(*dS2));
399 if (!dS2)
400 goto cleanup;
401
402 for (i = 0; i < l; i++) {
403 cache[i].d1 = cache[i].d2 = cache[i].d3 = INFINITY;
404 for (j = 0; j < k; j++) {
405 d = distances[distanceindex(i, clusters[j].medoi…
406 if (d < cache[i].d1) {
407 cache[i].d3 = cache[i].d2;
408 cache[i].d2 = cache[i].d1;
409 cache[i].d1 = d;
410 cache[i].n3 = cache[i].n2;
411 cache[i].n2 = cache[i].n1;
412 cache[i].n1 = j;
413 } else if (d < cache[i].d2) {
414 cache[i].d3 = cache[i].d2;
415 cache[i].d2 = d;
416 cache[i].n3 = cache[i].n2;
417 cache[i].n2 = j;
418 } else if (d < cache[i].d3) {
419 cache[i].d3 = d;
420 cache[i].n3 = j;
421 }
422 }
423 }
424
425 for (i = 0; i < k; i++) {
426 dS[i] = 0;
427 for (j = 0; j < l; j++) {
428 if (cache[j].n1 != i)
429 continue;
430 dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d2…
431 }
432 for (j = 0; j < l; j++) {
433 if (cache[j].n2 != i)
434 continue;
435 dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d1…
436 }
437 }
438
439 lx = -1;
440 for (grimmel = 0;grimmel < 32;grimmel++) {
441 cdS = 0;
442 for (i = 0; i < l; i++) {
443 if (i == lx)
444 goto fastermsc_finished;
445
446 for (j = 0; j < k; j++) {
447 if (clusters[j].medoid == i)
448 goto fastmsc_nexti;
449 }
450
451 memcpy(dS2, dS, k * sizeof(*dS2));
452 cdS2 = 0;
453 for (j = 0; j < l; j++) {
454 d = distances[distanceindex(i, j)];
455 if (d < cache[j].d1) {
456 cdS2 += cache[j].d1 / cache[j].d…
457 dS2[cache[j].n1] += d / cache[j]…
458 dS2[cache[j].n2] += cache[j].d1 …
459 } else if (d < cache[j].d2) {
460 cdS2 += cache[j].d1 / cache[j].d…
461 dS2[cache[j].n1] += cache[j].d1 …
462 dS2[cache[j].n2] += cache[j].d1 …
463 } else if (d < cache[j].d3) {
464 dS2[cache[j].n1] += cache[j].d2 …
465 dS2[cache[j].n2] += cache[j].d1 …
466 }
467 }
468 d = -INFINITY;
469 for (j = 0; j < k; j++) {
470 if (dS2[j] > d) {
471 d = dS2[j];
472 m = j;
473 }
474 }
475 dS2[m] += cdS2;
476 if (dS2[m] <= 0)
477 continue;
478 lx = i;
479 clusters[m].medoid = i;
480 for (j = 0; j < l; j++) {
481 if (cache[j].n1 != m && cache[j].n2 != m…
482 continue;
483 cache[j].d1 = cache[j].d2 = cache[j].d3 …
484 for (n = 0; n < k; n++) {
485 d = distances[distanceindex(j, c…
486 if (d < cache[j].d1) {
487 cache[j].d3 = cache[j].d…
488 cache[j].d2 = cache[j].d…
489 cache[j].d1 = d;
490 cache[j].n3 = cache[j].n…
491 cache[j].n2 = cache[j].n…
492 cache[j].n1 = n;
493 } else if (d < cache[j].d2) {
494 cache[j].d3 = cache[j].d…
495 cache[j].d2 = d;
496 cache[j].n3 = cache[j].n…
497 cache[j].n2 = n;
498 } else if (d < cache[j].d3) {
499 cache[j].d3 = d;
500 cache[j].n3 = n;
501 }
502 }
503 }
504 for (j = 0; j < k; j++) {
505 dS[j] = 0;
506 for (n = 0; n < l; n++) {
507 if (cache[n].n1 != j)
508 continue;
509 dS[j] += cache[n].d1 / cache[n].…
510 }
511 for (n = 0; n < l; n++) {
512 if (cache[n].n2 != j)
513 continue;
514 dS[j] += cache[n].d1 / cache[n].…
515 }
516 }
517 fastmsc_nexti:;
518 }
519 /*
520 if (lx == -1)
521 break;
522 */
523 }
524 fastermsc_finished:;
525
526 for (i = 0; i < k; i++)
527 clusters[i].n = 0;
528
529 for (i = 0; i < l; i++) {
530 d = INFINITY;
531 for (j = 0; j < k; j++) {
532 if (distances[distanceindex(clusters[j].medoid, …
533 assoc[i] = j;
534 d = distances[distanceindex(clusters[j].…
535 }
536 }
537 clusters[assoc[i]].n++;
538 }
539
540 for (i = 0; i < l; i++) {
541 if (clusters[assoc[i]].medoid == i)
542 assoc[i] |= 128;
543 }
544
545 result = k;
546 cleanup:
547 free(clusters);
548 return result;
549 }
550
551 int
552 fastmsc(struct gopherentry *g, size_t l, size_t k, double *distances, si…
553 {
554 size_t i, j, n, ck;
555 ssize_t m1, m, m2;
556 size_t ti, th;
557 double dsum, d, d2, gnarf, tdsum;
558 struct {
559 size_t medoid;
560 size_t n;
561 double dsum;
562 } *clusters;
563 struct {
564 size_t n1, n2;
565 double d1, d2, d3;
566 } *cache;
567 double *dS, *dS2;
568 double cdS, cdS2;
569 double _e, _y, _z, _s;
570 size_t cm, cx;
571 int flag, result;
572 uint32_t r;
573 double *C;
574
575 if (!l)
576 return 0;
577
578 if (l < k)
579 k = l;
580
581 clusters = calloc(k, sizeof(*clusters));
582 if (!clusters)
583 return -1;
584
585 result = -1;
586
587 /*
588 seed the medoids using PAM BUILD
589 d = INFINITY;
590 m = -1;
591 for (i = 0; i < l; i++) {
592 dsum = 0;
593 for (j = 0; j < l; j++)
594 dsum += distances[distanceindex(i, j)];
595 if (dsum < d) {
596 m = i;
597 d = dsum;
598 }
599 }
600 if (m == -1)
601 goto cleanup;
602 ck = 0;
603 clusters[ck].medoid = m;
604 ck++;
605
606 C = xmalloc(l * l * sizeof(*C));
607
608 for (; ck < k; ck++) {
609 memset(C, 0, l * l * sizeof(*C));
610 for (i = 0; i < l; i++) {
611 for (j = 0; j < ck; j++) {
612 if (clusters[j].medoid == i)
613 goto build_nexti;
614 }
615 for (j = 0; j < l; j++) {
616 if (i == j)
617 continue;
618 for (n = 0; n < ck; n++) {
619 if (clusters[n].medoid == j)
620 goto build_nextj;
621 }
622 d = INFINITY;
623 for (n = 0; n < ck; n++) {
624 if (distances[distanceindex(j, c…
625 d = distances[distancein…
626 }
627 C[j * l + i] = d - distances[distanceind…
628 if (C[j * l + i] < 0)
629 C[j * l + i] = 0;
630 build_nextj:;
631 }
632 build_nexti:;
633 }
634 d = -1;
635 m = -1;
636 for (i = 0; i < l; i++) {
637 for (j = 0; j < ck; j++) {
638 if (clusters[j].medoid == i)
639 goto build_nexti2;
640 }
641 dsum = 0;
642 for (j = 0; j < l; j++)
643 dsum += C[j * l + i];
644 if (dsum > d) {
645 d = dsum;
646 m = i;
647 }
648 build_nexti2:;
649 }
650 clusters[ck].medoid = m;
651 }
652 free(C);
653 */
654
655 /*
656 Seed the medoids randomly
657 */
658 for (i = n = 0; i < l; i++) {
659 if ((l - i) * (xorshift(&prng) / (double)UINT32_MAX) < k…
660 clusters[n++].medoid = i;
661 if (n == k)
662 break;
663 }
664
665 cache = calloc(l, sizeof(*cache));
666 if (!cache)
667 goto cleanup;
668
669 dS = calloc(k, sizeof(*dS));
670 if (!dS)
671 goto cleanup;
672
673 dS2 = calloc(k, sizeof(*dS2));
674 if (!dS2)
675 goto cleanup;
676
677 for (;;) {
678 for (i = 0; i < l; i++) {
679 cache[i].d1 = cache[i].d2 = cache[i].d3 = INFINI…
680 for (j = 0; j < k; j++) {
681 d = distances[distanceindex(i, clusters[…
682 if (d < cache[i].d1) {
683 cache[i].d3 = cache[i].d2;
684 cache[i].d2 = cache[i].d1;
685 cache[i].d1 = d;
686 cache[i].n2 = cache[i].n1;
687 cache[i].n1 = j;
688 } else if (d < cache[i].d2) {
689 cache[i].d3 = cache[i].d2;
690 cache[i].d2 = d;
691 cache[i].n2 = j;
692 } else if (d < cache[i].d3) {
693 cache[i].d3 = d;
694 }
695 }
696 }
697
698 for (i = 0; i < k; i++) {
699 dS[i] = 0;
700 for (j = 0; j < l; j++) {
701 if (cache[j].n1 != i)
702 continue;
703 dS[i] += cache[j].d1 / cache[j].d2 - cac…
704 }
705 for (j = 0; j < l; j++) {
706 if (cache[j].n2 != i)
707 continue;
708 dS[i] += cache[j].d1 / cache[j].d2 - cac…
709 }
710 }
711
712 cdS = 0;
713 for (i = 0; i < l; i++) {
714 for (j = 0; j < k; j++) {
715 if (clusters[j].medoid == i)
716 goto fastmsc_nexti;
717 }
718 memcpy(dS2, dS, k * sizeof(*dS2));
719 cdS2 = _e = 0;
720 for (j = 0; j < l; j++) {
721 d = distances[distanceindex(i, j)];
722 if (d < cache[j].d1) {
723 cdS2 += cache[j].d1 / cache[j].d…
724 dS2[cache[j].n1] += d / cache[j]…
725 dS2[cache[j].n2] += cache[j].d1 …
726 } else if (d < cache[j].d2) {
727 cdS2 += cache[j].d1 / cache[j].d…
728 dS2[cache[j].n1] += cache[j].d1 …
729 dS2[cache[j].n2] += cache[j].d1 …
730 } else if (d < cache[j].d3) {
731 dS2[cache[j].n1] += cache[j].d2 …
732 dS2[cache[j].n2] += cache[j].d1 …
733 }
734 }
735 d = -INFINITY;
736 d2 = INFINITY;
737 for (j = 0; j < k; j++) {
738 if (dS2[j] > d) {
739 d = dS2[j];
740 m = j;
741 }
742 if (dS2[j] < d2) {
743 d2 = dS2[j];
744 m2 = j;
745 }
746 }
747 dS2[m] += cdS2;
748 if (dS2[m] > cdS) {
749 cdS = dS2[m];
750 cm = m;
751 cx = i;
752 }
753 fastmsc_nexti:;
754 }
755 /* Should be 0. I guess some floating point problems? */
756 if (cdS <= 0.00000000000001)
757 break;
758 clusters[cm].medoid = cx;
759 }
760
761 for (i = 0; i < k; i++)
762 clusters[i].n = 0;
763
764 for (i = 0; i < l; i++) {
765 d = INFINITY;
766 for (j = 0; j < k; j++) {
767 if (distances[distanceindex(clusters[j].medoid, …
768 assoc[i] = j;
769 d = distances[distanceindex(clusters[j].…
770 }
771 }
772 clusters[assoc[i]].n++;
773 }
774
775 for (i = 0; i < l; i++) {
776 if (clusters[assoc[i]].medoid == i)
777 assoc[i] |= 128;
778 }
779
780 result = k;
781 cleanup:
782 free(clusters);
783 return result;
784 }
785
786 /*
787 Seeding of medoids based on the PAM BUILD description in https:/…
788 I was too stupid to understand other descriptions.
789 */
790 int
791 kmedoids(struct gopherentry *g, size_t l, size_t k, double *distances, s…
792 {
793 size_t i, j, n, ck;
794 ssize_t m1, m;
795 size_t ti, th;
796 double dsum, d, d2, gnarf, tdsum;
797 struct {
798 size_t medoid;
799 size_t n;
800 double dsum;
801 } *clusters;
802 int flag, result;
803 uint32_t r;
804 double *C;
805
806 if (!l)
807 return 0;
808
809 if (l < k)
810 k = l;
811
812 clusters = calloc(k, sizeof(*clusters));
813 if (!clusters)
814 return -1;
815
816 result = -1;
817
818 /* seed the medoids using PAM BUILD */
819 d = INFINITY;
820 m = -1;
821 for (i = 0; i < l; i++) {
822 dsum = 0;
823 for (j = 0; j < l; j++)
824 dsum += distances[distanceindex(i, j)];
825 if (dsum < d) {
826 m = i;
827 d = dsum;
828 }
829 }
830 if (m == -1)
831 goto cleanup;
832 ck = 0;
833 clusters[ck].medoid = m;
834 ck++;
835
836 C = xmalloc(l * l * sizeof(*C));
837
838 for (; ck < k; ck++) {
839 memset(C, 0, l * l * sizeof(*C));
840 for (i = 0; i < l; i++) {
841 for (j = 0; j < ck; j++) {
842 if (clusters[j].medoid == i)
843 goto build_nexti;
844 }
845 for (j = 0; j < l; j++) {
846 if (i == j)
847 continue;
848 for (n = 0; n < ck; n++) {
849 if (clusters[n].medoid == j)
850 goto build_nextj;
851 }
852 d = INFINITY;
853 for (n = 0; n < ck; n++) {
854 if (distances[distanceindex(j, c…
855 d = distances[distancein…
856 }
857 C[j * l + i] = d - distances[distanceind…
858 if (C[j * l + i] < 0)
859 C[j * l + i] = 0;
860 build_nextj:;
861 }
862 build_nexti:;
863 }
864 d = -1;
865 m = -1;
866 for (i = 0; i < l; i++) {
867 for (j = 0; j < ck; j++) {
868 if (clusters[j].medoid == i)
869 goto build_nexti2;
870 }
871 dsum = 0;
872 for (j = 0; j < l; j++)
873 dsum += C[j * l + i];
874 if (dsum > d) {
875 d = dsum;
876 m = i;
877 }
878 build_nexti2:;
879 }
880 clusters[ck].medoid = m;
881 }
882 free(C);
883
884 for (;;) {
885 tdsum = INFINITY;
886 for (i = 0; i < k; i++) {
887 for (j = 0; j < l; j++) {
888 for (n = 0; n < k; n++) {
889 if (j == clusters[n].medoid)
890 goto swap_nextj;
891 }
892 dsum = 0;
893 for (n = 0; n < l; n++) {
894 if (n == j)
895 continue;
896 for (m = 0; m < k; m++) {
897 if (clusters[m].medoid =…
898 goto swap_nextn;
899 }
900 d = INFINITY;
901 d2 = INFINITY;
902 for (m = 0; m < k; m++) {
903 if (distances[distancein…
904 d2 = d;
905 d = distances[di…
906 } else if (distances[dis…
907 d2 = distances[d…
908 }
909 }
910 if (distances[distanceindex(clus…
911 if (distances[distancein…
912 gnarf = 0;
913 else
914 gnarf = distance…
915 } else if (distances[distanceind…
916 if (distances[distancein…
917 gnarf = distance…
918 else
919 gnarf = d2 - d;
920 } else {
921 continue;
922 }
923 dsum += gnarf;
924 swap_nextn:;
925 }
926 if (dsum < tdsum) {
927 ti = i;
928 th = j;
929 tdsum = dsum;
930 }
931 swap_nextj:;
932 }
933 }
934 if (tdsum >= 0)
935 break;
936 clusters[ti].medoid = th;
937 }
938
939 for (i = 0; i < k; i++)
940 clusters[i].n = 0;
941
942 for (i = 0; i < l; i++) {
943 d = INFINITY;
944 for (j = 0; j < k; j++) {
945 if (distances[distanceindex(clusters[j].medoid, …
946 assoc[i] = j;
947 d = distances[distanceindex(clusters[j].…
948 }
949 }
950 clusters[assoc[i]].n++;
951 }
952
953 for (i = 0; i < l; i++) {
954 if (clusters[assoc[i]].medoid == i)
955 assoc[i] |= 128;
956 }
957
958 result = k;
959 cleanup:
960 free(clusters);
961 return result;
962 }
963
964 size_t
965 diff(size_t a, size_t b)
966 {
967 if (a < b)
968 return b - a;
969 return a - b;
970 }
971
972 /*
973 https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_wit…
974 Which references https://www.codeproject.com/Articles/13525/Fast…
975 The code there is using the CPOL: https://www.codeproject.com/in…
976 */
977 size_t
978 levenshteindistance(char *a, char *b)
979 {
980 size_t i, j, dc, ic, sc, r;
981 size_t *v0, *v1, *t;
982
983 v0 = xmalloc((strlen(b) + 1) * sizeof(*v0));
984 v1 = xmalloc((strlen(b) + 1) * sizeof(*v1));
985
986 for (i = 0; i <= strlen(b); i++)
987 v0[i] = i;
988
989 for (i = 0; i < strlen(a); i++) {
990 v1[0] = i + 1;
991 for (j = 0; j < strlen(b); j++) {
992 dc = v0[j + 1] + 1;
993 ic = v1[j] + 1;
994 if (a[i] == b[j])
995 sc = v0[j];
996 else
997 sc = v0[j] + 1;
998 if (dc < ic && dc < sc)
999 v1[j + 1] = dc;
1000 else if (ic < dc && ic < sc)
1001 v1[j + 1] = ic;
1002 else
1003 v1[j + 1] = sc;
1004 }
1005 t = v0;
1006 v0 = v1;
1007 v1 = t;
1008 }
1009
1010 r = v0[strlen(b)];
1011
1012 free(v1);
1013 free(v0);
1014
1015 return r;
1016 }
1017
1018 /*
1019 Remove common prefix and suffix. The pazz0distance is the sum of…
1020 */
1021 size_t
1022 pazz0distance(char *a, char *b)
1023 {
1024 size_t pl, sl;
1025 size_t al, bl;
1026
1027 al = strlen(a);
1028 bl = strlen(b);
1029
1030 for (pl = 0; pl < al && pl < bl && a[pl] == b[pl]; pl++);
1031 for (sl = 0; al - sl > pl && bl - sl > pl && a[al - 1 - sl] == b…
1032
1033 return al + bl - 2 * (pl + sl);
1034 }
1035
1036 /*
1037 https://en.wikipedia.org/wiki/Levenshtein_distance#Recursive
1038 */
1039 size_t
1040 levenshteindistance_slow(char *a, char *b)
1041 {
1042 size_t k, l, m;
1043
1044 if (!*a)
1045 return strlen(b);
1046 else if (!*b)
1047 return strlen(a);
1048 else if (*a == *b)
1049 return levenshteindistance_slow(a + 1, b + 1);
1050
1051 k = levenshteindistance_slow(a + 1, b);
1052 l = levenshteindistance_slow(a, b + 1);
1053 m = levenshteindistance_slow(a + 1, b + 1);
1054
1055 if (k < l && k < m)
1056 return 1 + k;
1057 else if (l < k && l < m)
1058 return 1 + l;
1059 else
1060 return 1 + m;
1061 }
1062
1063 ssize_t
1064 clustering(char *host, char *port, char *selector, struct gopherentry *g…
1065 {
1066 size_t i, j;
1067 ssize_t r;
1068 double *distances, d;
1069 double *mdists;
1070 struct {
1071 double selector;
1072 double segment;
1073 double index;
1074 } *distanceparts, maxdistanceparts;
1075 uint32_t prng;
1076 struct point *points;
1077
1078 distanceparts = xmalloc(triangularnumber(l) * sizeof(*distancepa…
1079
1080 /* So the sammon mapping can be happy. */
1081 prng = fnv1a(4, host, port, selector, "distancewiggle");
1082 memset(&maxdistanceparts, 0, sizeof(maxdistanceparts));
1083 for (i = 0; i < l; i++) {
1084 memset(&distanceparts[distanceindex(i, i)], 0, sizeof(*d…
1085 for (j = 0; j < i; j++) {
1086 distanceparts[distanceindex(i, j)].selector = le…
1087 distanceparts[distanceindex(i, j)].segment = dif…
1088 distanceparts[distanceindex(i, j)].index = diff(…
1089
1090 if (distanceparts[distanceindex(i, j)].selector …
1091 maxdistanceparts.selector = distancepart…
1092 if (distanceparts[distanceindex(i, j)].segment >…
1093 maxdistanceparts.segment = distanceparts…
1094 if (distanceparts[distanceindex(i, j)].index > m…
1095 maxdistanceparts.index = distanceparts[d…
1096 }
1097 }
1098 for (i = 0; i < l; i++) {
1099 for (j = 0; j < i; j++) {
1100 distanceparts[distanceindex(i, j)].selector /= m…
1101 distanceparts[distanceindex(i, j)].segment /= ma…
1102 distanceparts[distanceindex(i, j)].index /= maxd…
1103 }
1104 }
1105
1106 distances = xmalloc(triangularnumber(l) * sizeof(*distances));
1107 for (i = 0; i < l; i++) {
1108 distances[distanceindex(i, i)] = 0;
1109 for (j = 0; j < i; j++)
1110 distances[distanceindex(i, j)] = 0.75 * distance…
1111 }
1112 free(distanceparts);
1113
1114 r = kmedoids(g, l, k, distances, assocs, fnv1a(4, host, port, se…
1115
1116 points = calloc(l, sizeof(*points));
1117 sammon(distances, l, fnv1a(4, host, port, selector, "sammonseed"…
1118
1119 for (i = 0; i < l; i++)
1120 printf("%lf\t%lf\t%ld\t%lu\t%s\n", points[i].dims[0], po…
1121
1122 free(points);
1123 free(distances);
1124
1125 return r;
1126 }
1127
1128 char gopherbuffer[16 * 1024 * 1024];
1129
1130 int
1131 main(int argc, char *argv[])
1132 {
1133 ssize_t l;
1134 size_t k;
1135 struct gopherentry *g;
1136 char *host, *port, *selector;
1137 size_t *cassocs;
1138
1139 if (argc < 2) {
1140 printf("%s host [port [selector]]\n", argv[0]);
1141 return 1;
1142 }
1143
1144 host = argv[1];
1145 port = (argc > 2) ? argv[2] : "70";
1146 selector = (argc > 3) ? argv[3] : "";
1147
1148 l = gopher_request(host, port, selector, gopherbuffer, sizeof(go…
1149 if (l == -1)
1150 return 1;
1151
1152 l = gopher_parsemenu(gopherbuffer, &g);
1153 if (l == -1)
1154 return 1;
1155
1156 l = gopher_removeentries(g, l);
1157
1158 cassocs = calloc(l, sizeof(*cassocs));
1159 if (!cassocs)
1160 return 1;
1161 k = 5;
1162 clustering(host, port, selector, g, l, cassocs, k);
1163
1164 return 0;
1165 }
You are viewing proxied material from bitreich.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.