gopher-clustering2.c - brcon2024-hackathons - Bitreichcon 2024 Hackathons | |
git clone git://bitreich.org/brcon2024-hackathons git://enlrupgkhuxnvlhsf6lc3fz… | |
Log | |
Files | |
Refs | |
Tags | |
Submodules | |
--- | |
gopher-clustering2.c (24998B) | |
--- | |
1 #define _POSIX_C_SOURCE 200112L | |
2 | |
3 #include <assert.h> | |
4 #include <stdarg.h> | |
5 #include <stdint.h> | |
6 #include <stdio.h> | |
7 #include <stdlib.h> | |
8 #include <string.h> | |
9 | |
10 #include <math.h> | |
11 | |
12 #include <netdb.h> | |
13 #include <sys/socket.h> | |
14 #include <sys/types.h> | |
15 #include <unistd.h> | |
16 | |
17 uint32_t | |
18 fnv1a(int n,...) | |
19 { | |
20 int i; | |
21 char *s; | |
22 va_list l; | |
23 uint32_t h; | |
24 | |
25 h = 0x811c9dc5; | |
26 | |
27 va_start(l, n); | |
28 for (i = 0; i < n; i++) { | |
29 for (s = va_arg(l, char*); *s; s++) { | |
30 h ^= *s; | |
31 h *= 0x01000193; | |
32 } | |
33 } | |
34 va_end(l); | |
35 | |
36 return h; | |
37 } | |
38 | |
39 uint32_t | |
40 xorshift(uint32_t *s) | |
41 { | |
42 *s ^= *s << 13; | |
43 *s ^= *s >> 17; | |
44 *s ^= *s << 5; | |
45 return *s; | |
46 } | |
47 | |
48 /* | |
49 https://github.com/skeeto/hash-prospector/issues/19#issuecomment… | |
50 */ | |
51 uint32_t | |
52 mix(uint32_t x) | |
53 { | |
54 x ^= x >> 16; | |
55 x *= 0x21f0aaad; | |
56 x ^= x >> 15; | |
57 x *= 0x735a2d97; | |
58 x ^= x >> 16; | |
59 return x; | |
60 } | |
61 | |
62 struct gopherentry { | |
63 struct gopherentry *next; | |
64 char type; | |
65 char *fields[4]; | |
66 size_t i, s; | |
67 }; | |
68 | |
69 char * | |
70 gopher_fieldend(char *s) | |
71 { | |
72 for (; *s; s++) { | |
73 if (*s == '\t') | |
74 break; | |
75 if (*s == '\r' && *(s+1) == '\n') | |
76 break; | |
77 if (*s == '\n') | |
78 break; | |
79 } | |
80 return s; | |
81 } | |
82 | |
83 char * | |
84 gopher_nextentry(char *s) | |
85 { | |
86 char *c; | |
87 | |
88 if (c = strchr(s, '\n')) | |
89 return c + 1; | |
90 return NULL; | |
91 } | |
92 | |
93 void * | |
94 xmalloc(size_t s) | |
95 { | |
96 void *p; | |
97 | |
98 if (!(p = malloc(s))) { | |
99 perror(NULL); | |
100 exit(1); | |
101 } | |
102 return p; | |
103 } | |
104 | |
105 void * | |
106 xrealloc(void *p, size_t s) | |
107 { | |
108 if (!(p = realloc(p, s))) { | |
109 perror(NULL); | |
110 exit(1); | |
111 } | |
112 return p; | |
113 } | |
114 | |
115 /* Allow a lot of ugly things... */ | |
116 size_t | |
117 gopher_parsemenu(char *d, struct gopherentry **g) | |
118 { | |
119 ssize_t gl; | |
120 size_t fl, i, s; | |
121 char *c, *p, pt; | |
122 struct gopherentry *cg; | |
123 | |
124 gl = 0; | |
125 cg = NULL; | |
126 | |
127 s = 0; | |
128 pt = 0; | |
129 for (c = d; c && *c; c = gopher_nextentry(c)) { | |
130 if (*c == '.') | |
131 break; | |
132 if (*c == '\n') | |
133 continue; | |
134 | |
135 gl++; | |
136 cg = xrealloc(cg, gl * sizeof(struct gopherentry)); | |
137 memset(&cg[gl-1], 0, sizeof(struct gopherentry)); | |
138 | |
139 if (*c != 'i' && pt == 'i') | |
140 s++; | |
141 pt = *c; | |
142 | |
143 cg[gl-1].type = *c; | |
144 cg[gl-1].i = gl-1; | |
145 cg[gl-1].s = s; | |
146 c++; | |
147 | |
148 for (i = 0; i < 4; i++) { | |
149 p = gopher_fieldend(c); | |
150 fl = p - c; | |
151 cg[gl-1].fields[i] = xmalloc(fl + 1); | |
152 memcpy(cg[gl-1].fields[i], c, fl); | |
153 cg[gl-1].fields[i][fl] = '\0'; | |
154 if (*p != '\t') | |
155 break; | |
156 c = p + 1; | |
157 } | |
158 } | |
159 s++; | |
160 *g = cg; | |
161 return gl; | |
162 } | |
163 | |
164 ssize_t | |
165 gopher_removeentries(struct gopherentry *g, size_t l) | |
166 { | |
167 size_t i, j; | |
168 | |
169 for (i = 0; i < l;) { | |
170 if (g[i].type == 'i' || g[i].type == '3' || !g[i].fields… | |
171 l--; | |
172 memmove(&g[i], &g[i+1], (l - i) * sizeof(*g)); | |
173 continue; | |
174 } | |
175 for (j = 0; j < i; j++) { | |
176 if (!strcmp(g[i].fields[1], g[j].fields[1]) && | |
177 !strcmp(g[i].fields[2], g[j].fields[2]) && | |
178 !strcmp(g[i].fields[3], g[j].fields[3])) | |
179 break; | |
180 } | |
181 if (j < i) { | |
182 l--; | |
183 memmove(&g[i], &g[i+1], (l - i) * sizeof(*g)); | |
184 continue; | |
185 } | |
186 i++; | |
187 } | |
188 | |
189 return l; | |
190 } | |
191 | |
192 ssize_t | |
193 gopher_request(char *host, char *port, char *selector, char *buffer, siz… | |
194 { | |
195 int fd; | |
196 struct addrinfo hints; | |
197 struct addrinfo *r, *rp; | |
198 ssize_t n, result, rn; | |
199 | |
200 memset(&hints, 0, sizeof(hints)); | |
201 hints.ai_family = AF_UNSPEC; | |
202 hints.ai_socktype = SOCK_STREAM; | |
203 | |
204 if (getaddrinfo(host, port, &hints, &r) != 0) | |
205 return -1; | |
206 | |
207 for (rp = r; rp; rp = rp->ai_next) { | |
208 if ((fd = socket(rp->ai_family, rp->ai_socktype, rp->ai_… | |
209 continue; | |
210 if (connect(fd, rp->ai_addr, rp->ai_addrlen) != -1) | |
211 break; | |
212 close(fd); | |
213 } | |
214 freeaddrinfo(r); | |
215 if (!rp) | |
216 return -1; | |
217 | |
218 result = -1; | |
219 if (write(fd, selector, strlen(selector)) != strlen(selector)) | |
220 goto cleanup; | |
221 | |
222 if (write(fd, "\r\n", 2) != 2) | |
223 goto cleanup; | |
224 | |
225 n = 0; | |
226 while (1) { | |
227 if (n + 512 > l - 1) | |
228 goto cleanup; | |
229 if ((rn = read(fd, &buffer[n], 512)) == -1) | |
230 goto cleanup; | |
231 else if (!rn) | |
232 break; | |
233 n += rn; | |
234 } | |
235 | |
236 buffer[n] = '\0'; | |
237 result = n; | |
238 | |
239 cleanup: | |
240 close(fd); | |
241 | |
242 return result; | |
243 } | |
244 | |
245 size_t | |
246 triangularnumber(size_t n) | |
247 { | |
248 return (n * n + n) / 2; | |
249 } | |
250 | |
251 size_t | |
252 distanceindex(size_t i, size_t j) | |
253 { | |
254 size_t t; | |
255 | |
256 if (i < j) { | |
257 t = i; | |
258 i = j; | |
259 j = t; | |
260 } | |
261 | |
262 return triangularnumber(i) + j; | |
263 } | |
264 | |
265 struct point { | |
266 double dims[2]; | |
267 }; | |
268 | |
269 #define SAMMON_MAPPING_MAX_ITER (1 << 13) | |
270 #define SAMMON_MAPPING_LR (0.01) | |
271 void | |
272 sammon(double *distances, size_t l, uint32_t prng, struct point *points) | |
273 { | |
274 size_t i, j, k, t, tmax; | |
275 double *gnarf; | |
276 double c, lr; | |
277 double e1, e2, sum, lasterror; | |
278 struct point *newpoints; | |
279 | |
280 gnarf = calloc(triangularnumber(l), sizeof(*gnarf)); | |
281 newpoints = calloc(l, sizeof(*newpoints)); | |
282 | |
283 for (i = 0; i < l; i++) { | |
284 points[i].dims[0] = (double)xorshift(&prng) / UINT32_MAX… | |
285 points[i].dims[1] = (double)xorshift(&prng) / UINT32_MAX… | |
286 } | |
287 | |
288 c = 0; | |
289 for (i = 0; i < l; i++) { | |
290 gnarf[distanceindex(i, i)] = 0; | |
291 for (j = 0; j < i; j++) { | |
292 gnarf[distanceindex(i, j)] = sqrt(pow(points[i].… | |
293 c += distances[distanceindex(i, j)]; | |
294 } | |
295 } | |
296 | |
297 lasterror = 0; | |
298 for (t = 0, tmax = SAMMON_MAPPING_MAX_ITER; t < tmax; t++) { | |
299 lr = SAMMON_MAPPING_LR; | |
300 for (i = 0; i < l; i++) { | |
301 for (j = 0; j < 2; j++) { | |
302 e1 = -2. / c; | |
303 sum = 0; | |
304 for (k = 0; k < l; k++) { | |
305 if (i == k) | |
306 continue; | |
307 sum += (distances[distanceindex(… | |
308 } | |
309 e1 *= sum; | |
310 | |
311 e2 = -2. / c; | |
312 sum = 0; | |
313 for (k = 0; k < l; k++) { | |
314 if (i == k) | |
315 continue; | |
316 sum += 1. / (distances[distancei… | |
317 } | |
318 e2 *= sum; | |
319 | |
320 newpoints[i].dims[j] = points[i].dims[j]… | |
321 } | |
322 } | |
323 | |
324 sum = 0; | |
325 for (i = 0; i < l; i++) { | |
326 for (j = 0; j < i; j++) { | |
327 gnarf[distanceindex(i, j)] = sqrt(pow(ne… | |
328 sum += pow(distances[distanceindex(i, j)… | |
329 } | |
330 } | |
331 | |
332 fprintf(stderr, "sammon mapping error: %lf\n", sum / c); | |
333 if (t > 0 && fabs(sum / c - lasterror) < 0.000000000001) | |
334 break; | |
335 lasterror = sum / c; | |
336 | |
337 memcpy(points, newpoints, l * sizeof(*points)); | |
338 } | |
339 | |
340 free(newpoints); | |
341 free(gnarf); | |
342 } | |
343 | |
344 int | |
345 fastermsc(struct gopherentry *g, size_t l, size_t k, double *distances, … | |
346 { | |
347 size_t i, j, n, ck, grimmel; | |
348 ssize_t m1, m, m2; | |
349 size_t ti, th, huh; | |
350 double dsum, d, d2, gnarf, tdsum; | |
351 struct { | |
352 size_t medoid; | |
353 size_t n; | |
354 double dsum; | |
355 } *clusters; | |
356 struct { | |
357 size_t n1, n2, n3; | |
358 double d1, d2, d3; | |
359 } *cache; | |
360 double *dS, *dS2; | |
361 double cdS, cdS2; | |
362 size_t cm, cx; | |
363 ssize_t lx; | |
364 int flag, result; | |
365 uint32_t r; | |
366 double *C; | |
367 | |
368 if (!l) | |
369 return 0; | |
370 | |
371 if (l < k) | |
372 k = l; | |
373 | |
374 clusters = calloc(k, sizeof(*clusters)); | |
375 if (!clusters) | |
376 return -1; | |
377 | |
378 result = -1; | |
379 | |
380 /* | |
381 Seed the medoids randomly | |
382 */ | |
383 for (i = n = 0; i < l; i++) { | |
384 if ((l - i) * (xorshift(&prng) / (double)UINT32_MAX) < k… | |
385 clusters[n++].medoid = i; | |
386 if (n == k) | |
387 break; | |
388 } | |
389 | |
390 cache = calloc(l, sizeof(*cache)); | |
391 if (!cache) | |
392 goto cleanup; | |
393 | |
394 dS = calloc(k, sizeof(*dS)); | |
395 if (!dS) | |
396 goto cleanup; | |
397 | |
398 dS2 = calloc(k, sizeof(*dS2)); | |
399 if (!dS2) | |
400 goto cleanup; | |
401 | |
402 for (i = 0; i < l; i++) { | |
403 cache[i].d1 = cache[i].d2 = cache[i].d3 = INFINITY; | |
404 for (j = 0; j < k; j++) { | |
405 d = distances[distanceindex(i, clusters[j].medoi… | |
406 if (d < cache[i].d1) { | |
407 cache[i].d3 = cache[i].d2; | |
408 cache[i].d2 = cache[i].d1; | |
409 cache[i].d1 = d; | |
410 cache[i].n3 = cache[i].n2; | |
411 cache[i].n2 = cache[i].n1; | |
412 cache[i].n1 = j; | |
413 } else if (d < cache[i].d2) { | |
414 cache[i].d3 = cache[i].d2; | |
415 cache[i].d2 = d; | |
416 cache[i].n3 = cache[i].n2; | |
417 cache[i].n2 = j; | |
418 } else if (d < cache[i].d3) { | |
419 cache[i].d3 = d; | |
420 cache[i].n3 = j; | |
421 } | |
422 } | |
423 } | |
424 | |
425 for (i = 0; i < k; i++) { | |
426 dS[i] = 0; | |
427 for (j = 0; j < l; j++) { | |
428 if (cache[j].n1 != i) | |
429 continue; | |
430 dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d2… | |
431 } | |
432 for (j = 0; j < l; j++) { | |
433 if (cache[j].n2 != i) | |
434 continue; | |
435 dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d1… | |
436 } | |
437 } | |
438 | |
439 lx = -1; | |
440 for (grimmel = 0;grimmel < 32;grimmel++) { | |
441 cdS = 0; | |
442 for (i = 0; i < l; i++) { | |
443 if (i == lx) | |
444 goto fastermsc_finished; | |
445 | |
446 for (j = 0; j < k; j++) { | |
447 if (clusters[j].medoid == i) | |
448 goto fastmsc_nexti; | |
449 } | |
450 | |
451 memcpy(dS2, dS, k * sizeof(*dS2)); | |
452 cdS2 = 0; | |
453 for (j = 0; j < l; j++) { | |
454 d = distances[distanceindex(i, j)]; | |
455 if (d < cache[j].d1) { | |
456 cdS2 += cache[j].d1 / cache[j].d… | |
457 dS2[cache[j].n1] += d / cache[j]… | |
458 dS2[cache[j].n2] += cache[j].d1 … | |
459 } else if (d < cache[j].d2) { | |
460 cdS2 += cache[j].d1 / cache[j].d… | |
461 dS2[cache[j].n1] += cache[j].d1 … | |
462 dS2[cache[j].n2] += cache[j].d1 … | |
463 } else if (d < cache[j].d3) { | |
464 dS2[cache[j].n1] += cache[j].d2 … | |
465 dS2[cache[j].n2] += cache[j].d1 … | |
466 } | |
467 } | |
468 d = -INFINITY; | |
469 for (j = 0; j < k; j++) { | |
470 if (dS2[j] > d) { | |
471 d = dS2[j]; | |
472 m = j; | |
473 } | |
474 } | |
475 dS2[m] += cdS2; | |
476 if (dS2[m] <= 0) | |
477 continue; | |
478 lx = i; | |
479 clusters[m].medoid = i; | |
480 for (j = 0; j < l; j++) { | |
481 if (cache[j].n1 != m && cache[j].n2 != m… | |
482 continue; | |
483 cache[j].d1 = cache[j].d2 = cache[j].d3 … | |
484 for (n = 0; n < k; n++) { | |
485 d = distances[distanceindex(j, c… | |
486 if (d < cache[j].d1) { | |
487 cache[j].d3 = cache[j].d… | |
488 cache[j].d2 = cache[j].d… | |
489 cache[j].d1 = d; | |
490 cache[j].n3 = cache[j].n… | |
491 cache[j].n2 = cache[j].n… | |
492 cache[j].n1 = n; | |
493 } else if (d < cache[j].d2) { | |
494 cache[j].d3 = cache[j].d… | |
495 cache[j].d2 = d; | |
496 cache[j].n3 = cache[j].n… | |
497 cache[j].n2 = n; | |
498 } else if (d < cache[j].d3) { | |
499 cache[j].d3 = d; | |
500 cache[j].n3 = n; | |
501 } | |
502 } | |
503 } | |
504 for (j = 0; j < k; j++) { | |
505 dS[j] = 0; | |
506 for (n = 0; n < l; n++) { | |
507 if (cache[n].n1 != j) | |
508 continue; | |
509 dS[j] += cache[n].d1 / cache[n].… | |
510 } | |
511 for (n = 0; n < l; n++) { | |
512 if (cache[n].n2 != j) | |
513 continue; | |
514 dS[j] += cache[n].d1 / cache[n].… | |
515 } | |
516 } | |
517 fastmsc_nexti:; | |
518 } | |
519 /* | |
520 if (lx == -1) | |
521 break; | |
522 */ | |
523 } | |
524 fastermsc_finished:; | |
525 | |
526 for (i = 0; i < k; i++) | |
527 clusters[i].n = 0; | |
528 | |
529 for (i = 0; i < l; i++) { | |
530 d = INFINITY; | |
531 for (j = 0; j < k; j++) { | |
532 if (distances[distanceindex(clusters[j].medoid, … | |
533 assoc[i] = j; | |
534 d = distances[distanceindex(clusters[j].… | |
535 } | |
536 } | |
537 clusters[assoc[i]].n++; | |
538 } | |
539 | |
540 for (i = 0; i < l; i++) { | |
541 if (clusters[assoc[i]].medoid == i) | |
542 assoc[i] |= 128; | |
543 } | |
544 | |
545 result = k; | |
546 cleanup: | |
547 free(clusters); | |
548 return result; | |
549 } | |
550 | |
551 int | |
552 fastmsc(struct gopherentry *g, size_t l, size_t k, double *distances, si… | |
553 { | |
554 size_t i, j, n, ck; | |
555 ssize_t m1, m, m2; | |
556 size_t ti, th; | |
557 double dsum, d, d2, gnarf, tdsum; | |
558 struct { | |
559 size_t medoid; | |
560 size_t n; | |
561 double dsum; | |
562 } *clusters; | |
563 struct { | |
564 size_t n1, n2; | |
565 double d1, d2, d3; | |
566 } *cache; | |
567 double *dS, *dS2; | |
568 double cdS, cdS2; | |
569 double _e, _y, _z, _s; | |
570 size_t cm, cx; | |
571 int flag, result; | |
572 uint32_t r; | |
573 double *C; | |
574 | |
575 if (!l) | |
576 return 0; | |
577 | |
578 if (l < k) | |
579 k = l; | |
580 | |
581 clusters = calloc(k, sizeof(*clusters)); | |
582 if (!clusters) | |
583 return -1; | |
584 | |
585 result = -1; | |
586 | |
587 /* | |
588 seed the medoids using PAM BUILD | |
589 d = INFINITY; | |
590 m = -1; | |
591 for (i = 0; i < l; i++) { | |
592 dsum = 0; | |
593 for (j = 0; j < l; j++) | |
594 dsum += distances[distanceindex(i, j)]; | |
595 if (dsum < d) { | |
596 m = i; | |
597 d = dsum; | |
598 } | |
599 } | |
600 if (m == -1) | |
601 goto cleanup; | |
602 ck = 0; | |
603 clusters[ck].medoid = m; | |
604 ck++; | |
605 | |
606 C = xmalloc(l * l * sizeof(*C)); | |
607 | |
608 for (; ck < k; ck++) { | |
609 memset(C, 0, l * l * sizeof(*C)); | |
610 for (i = 0; i < l; i++) { | |
611 for (j = 0; j < ck; j++) { | |
612 if (clusters[j].medoid == i) | |
613 goto build_nexti; | |
614 } | |
615 for (j = 0; j < l; j++) { | |
616 if (i == j) | |
617 continue; | |
618 for (n = 0; n < ck; n++) { | |
619 if (clusters[n].medoid == j) | |
620 goto build_nextj; | |
621 } | |
622 d = INFINITY; | |
623 for (n = 0; n < ck; n++) { | |
624 if (distances[distanceindex(j, c… | |
625 d = distances[distancein… | |
626 } | |
627 C[j * l + i] = d - distances[distanceind… | |
628 if (C[j * l + i] < 0) | |
629 C[j * l + i] = 0; | |
630 build_nextj:; | |
631 } | |
632 build_nexti:; | |
633 } | |
634 d = -1; | |
635 m = -1; | |
636 for (i = 0; i < l; i++) { | |
637 for (j = 0; j < ck; j++) { | |
638 if (clusters[j].medoid == i) | |
639 goto build_nexti2; | |
640 } | |
641 dsum = 0; | |
642 for (j = 0; j < l; j++) | |
643 dsum += C[j * l + i]; | |
644 if (dsum > d) { | |
645 d = dsum; | |
646 m = i; | |
647 } | |
648 build_nexti2:; | |
649 } | |
650 clusters[ck].medoid = m; | |
651 } | |
652 free(C); | |
653 */ | |
654 | |
655 /* | |
656 Seed the medoids randomly | |
657 */ | |
658 for (i = n = 0; i < l; i++) { | |
659 if ((l - i) * (xorshift(&prng) / (double)UINT32_MAX) < k… | |
660 clusters[n++].medoid = i; | |
661 if (n == k) | |
662 break; | |
663 } | |
664 | |
665 cache = calloc(l, sizeof(*cache)); | |
666 if (!cache) | |
667 goto cleanup; | |
668 | |
669 dS = calloc(k, sizeof(*dS)); | |
670 if (!dS) | |
671 goto cleanup; | |
672 | |
673 dS2 = calloc(k, sizeof(*dS2)); | |
674 if (!dS2) | |
675 goto cleanup; | |
676 | |
677 for (;;) { | |
678 for (i = 0; i < l; i++) { | |
679 cache[i].d1 = cache[i].d2 = cache[i].d3 = INFINI… | |
680 for (j = 0; j < k; j++) { | |
681 d = distances[distanceindex(i, clusters[… | |
682 if (d < cache[i].d1) { | |
683 cache[i].d3 = cache[i].d2; | |
684 cache[i].d2 = cache[i].d1; | |
685 cache[i].d1 = d; | |
686 cache[i].n2 = cache[i].n1; | |
687 cache[i].n1 = j; | |
688 } else if (d < cache[i].d2) { | |
689 cache[i].d3 = cache[i].d2; | |
690 cache[i].d2 = d; | |
691 cache[i].n2 = j; | |
692 } else if (d < cache[i].d3) { | |
693 cache[i].d3 = d; | |
694 } | |
695 } | |
696 } | |
697 | |
698 for (i = 0; i < k; i++) { | |
699 dS[i] = 0; | |
700 for (j = 0; j < l; j++) { | |
701 if (cache[j].n1 != i) | |
702 continue; | |
703 dS[i] += cache[j].d1 / cache[j].d2 - cac… | |
704 } | |
705 for (j = 0; j < l; j++) { | |
706 if (cache[j].n2 != i) | |
707 continue; | |
708 dS[i] += cache[j].d1 / cache[j].d2 - cac… | |
709 } | |
710 } | |
711 | |
712 cdS = 0; | |
713 for (i = 0; i < l; i++) { | |
714 for (j = 0; j < k; j++) { | |
715 if (clusters[j].medoid == i) | |
716 goto fastmsc_nexti; | |
717 } | |
718 memcpy(dS2, dS, k * sizeof(*dS2)); | |
719 cdS2 = _e = 0; | |
720 for (j = 0; j < l; j++) { | |
721 d = distances[distanceindex(i, j)]; | |
722 if (d < cache[j].d1) { | |
723 cdS2 += cache[j].d1 / cache[j].d… | |
724 dS2[cache[j].n1] += d / cache[j]… | |
725 dS2[cache[j].n2] += cache[j].d1 … | |
726 } else if (d < cache[j].d2) { | |
727 cdS2 += cache[j].d1 / cache[j].d… | |
728 dS2[cache[j].n1] += cache[j].d1 … | |
729 dS2[cache[j].n2] += cache[j].d1 … | |
730 } else if (d < cache[j].d3) { | |
731 dS2[cache[j].n1] += cache[j].d2 … | |
732 dS2[cache[j].n2] += cache[j].d1 … | |
733 } | |
734 } | |
735 d = -INFINITY; | |
736 d2 = INFINITY; | |
737 for (j = 0; j < k; j++) { | |
738 if (dS2[j] > d) { | |
739 d = dS2[j]; | |
740 m = j; | |
741 } | |
742 if (dS2[j] < d2) { | |
743 d2 = dS2[j]; | |
744 m2 = j; | |
745 } | |
746 } | |
747 dS2[m] += cdS2; | |
748 if (dS2[m] > cdS) { | |
749 cdS = dS2[m]; | |
750 cm = m; | |
751 cx = i; | |
752 } | |
753 fastmsc_nexti:; | |
754 } | |
755 /* Should be 0. I guess some floating point problems? */ | |
756 if (cdS <= 0.00000000000001) | |
757 break; | |
758 clusters[cm].medoid = cx; | |
759 } | |
760 | |
761 for (i = 0; i < k; i++) | |
762 clusters[i].n = 0; | |
763 | |
764 for (i = 0; i < l; i++) { | |
765 d = INFINITY; | |
766 for (j = 0; j < k; j++) { | |
767 if (distances[distanceindex(clusters[j].medoid, … | |
768 assoc[i] = j; | |
769 d = distances[distanceindex(clusters[j].… | |
770 } | |
771 } | |
772 clusters[assoc[i]].n++; | |
773 } | |
774 | |
775 for (i = 0; i < l; i++) { | |
776 if (clusters[assoc[i]].medoid == i) | |
777 assoc[i] |= 128; | |
778 } | |
779 | |
780 result = k; | |
781 cleanup: | |
782 free(clusters); | |
783 return result; | |
784 } | |
785 | |
786 /* | |
787 Seeding of medoids based on the PAM BUILD description in https:/… | |
788 I was too stupid to understand other descriptions. | |
789 */ | |
790 int | |
791 kmedoids(struct gopherentry *g, size_t l, size_t k, double *distances, s… | |
792 { | |
793 size_t i, j, n, ck; | |
794 ssize_t m1, m; | |
795 size_t ti, th; | |
796 double dsum, d, d2, gnarf, tdsum; | |
797 struct { | |
798 size_t medoid; | |
799 size_t n; | |
800 double dsum; | |
801 } *clusters; | |
802 int flag, result; | |
803 uint32_t r; | |
804 double *C; | |
805 | |
806 if (!l) | |
807 return 0; | |
808 | |
809 if (l < k) | |
810 k = l; | |
811 | |
812 clusters = calloc(k, sizeof(*clusters)); | |
813 if (!clusters) | |
814 return -1; | |
815 | |
816 result = -1; | |
817 | |
818 /* seed the medoids using PAM BUILD */ | |
819 d = INFINITY; | |
820 m = -1; | |
821 for (i = 0; i < l; i++) { | |
822 dsum = 0; | |
823 for (j = 0; j < l; j++) | |
824 dsum += distances[distanceindex(i, j)]; | |
825 if (dsum < d) { | |
826 m = i; | |
827 d = dsum; | |
828 } | |
829 } | |
830 if (m == -1) | |
831 goto cleanup; | |
832 ck = 0; | |
833 clusters[ck].medoid = m; | |
834 ck++; | |
835 | |
836 C = xmalloc(l * l * sizeof(*C)); | |
837 | |
838 for (; ck < k; ck++) { | |
839 memset(C, 0, l * l * sizeof(*C)); | |
840 for (i = 0; i < l; i++) { | |
841 for (j = 0; j < ck; j++) { | |
842 if (clusters[j].medoid == i) | |
843 goto build_nexti; | |
844 } | |
845 for (j = 0; j < l; j++) { | |
846 if (i == j) | |
847 continue; | |
848 for (n = 0; n < ck; n++) { | |
849 if (clusters[n].medoid == j) | |
850 goto build_nextj; | |
851 } | |
852 d = INFINITY; | |
853 for (n = 0; n < ck; n++) { | |
854 if (distances[distanceindex(j, c… | |
855 d = distances[distancein… | |
856 } | |
857 C[j * l + i] = d - distances[distanceind… | |
858 if (C[j * l + i] < 0) | |
859 C[j * l + i] = 0; | |
860 build_nextj:; | |
861 } | |
862 build_nexti:; | |
863 } | |
864 d = -1; | |
865 m = -1; | |
866 for (i = 0; i < l; i++) { | |
867 for (j = 0; j < ck; j++) { | |
868 if (clusters[j].medoid == i) | |
869 goto build_nexti2; | |
870 } | |
871 dsum = 0; | |
872 for (j = 0; j < l; j++) | |
873 dsum += C[j * l + i]; | |
874 if (dsum > d) { | |
875 d = dsum; | |
876 m = i; | |
877 } | |
878 build_nexti2:; | |
879 } | |
880 clusters[ck].medoid = m; | |
881 } | |
882 free(C); | |
883 | |
884 for (;;) { | |
885 tdsum = INFINITY; | |
886 for (i = 0; i < k; i++) { | |
887 for (j = 0; j < l; j++) { | |
888 for (n = 0; n < k; n++) { | |
889 if (j == clusters[n].medoid) | |
890 goto swap_nextj; | |
891 } | |
892 dsum = 0; | |
893 for (n = 0; n < l; n++) { | |
894 if (n == j) | |
895 continue; | |
896 for (m = 0; m < k; m++) { | |
897 if (clusters[m].medoid =… | |
898 goto swap_nextn; | |
899 } | |
900 d = INFINITY; | |
901 d2 = INFINITY; | |
902 for (m = 0; m < k; m++) { | |
903 if (distances[distancein… | |
904 d2 = d; | |
905 d = distances[di… | |
906 } else if (distances[dis… | |
907 d2 = distances[d… | |
908 } | |
909 } | |
910 if (distances[distanceindex(clus… | |
911 if (distances[distancein… | |
912 gnarf = 0; | |
913 else | |
914 gnarf = distance… | |
915 } else if (distances[distanceind… | |
916 if (distances[distancein… | |
917 gnarf = distance… | |
918 else | |
919 gnarf = d2 - d; | |
920 } else { | |
921 continue; | |
922 } | |
923 dsum += gnarf; | |
924 swap_nextn:; | |
925 } | |
926 if (dsum < tdsum) { | |
927 ti = i; | |
928 th = j; | |
929 tdsum = dsum; | |
930 } | |
931 swap_nextj:; | |
932 } | |
933 } | |
934 if (tdsum >= 0) | |
935 break; | |
936 clusters[ti].medoid = th; | |
937 } | |
938 | |
939 for (i = 0; i < k; i++) | |
940 clusters[i].n = 0; | |
941 | |
942 for (i = 0; i < l; i++) { | |
943 d = INFINITY; | |
944 for (j = 0; j < k; j++) { | |
945 if (distances[distanceindex(clusters[j].medoid, … | |
946 assoc[i] = j; | |
947 d = distances[distanceindex(clusters[j].… | |
948 } | |
949 } | |
950 clusters[assoc[i]].n++; | |
951 } | |
952 | |
953 for (i = 0; i < l; i++) { | |
954 if (clusters[assoc[i]].medoid == i) | |
955 assoc[i] |= 128; | |
956 } | |
957 | |
958 result = k; | |
959 cleanup: | |
960 free(clusters); | |
961 return result; | |
962 } | |
963 | |
964 size_t | |
965 diff(size_t a, size_t b) | |
966 { | |
967 if (a < b) | |
968 return b - a; | |
969 return a - b; | |
970 } | |
971 | |
972 /* | |
973 https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_wit… | |
974 Which references https://www.codeproject.com/Articles/13525/Fast… | |
975 The code there is using the CPOL: https://www.codeproject.com/in… | |
976 */ | |
977 size_t | |
978 levenshteindistance(char *a, char *b) | |
979 { | |
980 size_t i, j, dc, ic, sc, r; | |
981 size_t *v0, *v1, *t; | |
982 | |
983 v0 = xmalloc((strlen(b) + 1) * sizeof(*v0)); | |
984 v1 = xmalloc((strlen(b) + 1) * sizeof(*v1)); | |
985 | |
986 for (i = 0; i <= strlen(b); i++) | |
987 v0[i] = i; | |
988 | |
989 for (i = 0; i < strlen(a); i++) { | |
990 v1[0] = i + 1; | |
991 for (j = 0; j < strlen(b); j++) { | |
992 dc = v0[j + 1] + 1; | |
993 ic = v1[j] + 1; | |
994 if (a[i] == b[j]) | |
995 sc = v0[j]; | |
996 else | |
997 sc = v0[j] + 1; | |
998 if (dc < ic && dc < sc) | |
999 v1[j + 1] = dc; | |
1000 else if (ic < dc && ic < sc) | |
1001 v1[j + 1] = ic; | |
1002 else | |
1003 v1[j + 1] = sc; | |
1004 } | |
1005 t = v0; | |
1006 v0 = v1; | |
1007 v1 = t; | |
1008 } | |
1009 | |
1010 r = v0[strlen(b)]; | |
1011 | |
1012 free(v1); | |
1013 free(v0); | |
1014 | |
1015 return r; | |
1016 } | |
1017 | |
1018 /* | |
1019 Remove common prefix and suffix. The pazz0distance is the sum of… | |
1020 */ | |
1021 size_t | |
1022 pazz0distance(char *a, char *b) | |
1023 { | |
1024 size_t pl, sl; | |
1025 size_t al, bl; | |
1026 | |
1027 al = strlen(a); | |
1028 bl = strlen(b); | |
1029 | |
1030 for (pl = 0; pl < al && pl < bl && a[pl] == b[pl]; pl++); | |
1031 for (sl = 0; al - sl > pl && bl - sl > pl && a[al - 1 - sl] == b… | |
1032 | |
1033 return al + bl - 2 * (pl + sl); | |
1034 } | |
1035 | |
1036 /* | |
1037 https://en.wikipedia.org/wiki/Levenshtein_distance#Recursive | |
1038 */ | |
1039 size_t | |
1040 levenshteindistance_slow(char *a, char *b) | |
1041 { | |
1042 size_t k, l, m; | |
1043 | |
1044 if (!*a) | |
1045 return strlen(b); | |
1046 else if (!*b) | |
1047 return strlen(a); | |
1048 else if (*a == *b) | |
1049 return levenshteindistance_slow(a + 1, b + 1); | |
1050 | |
1051 k = levenshteindistance_slow(a + 1, b); | |
1052 l = levenshteindistance_slow(a, b + 1); | |
1053 m = levenshteindistance_slow(a + 1, b + 1); | |
1054 | |
1055 if (k < l && k < m) | |
1056 return 1 + k; | |
1057 else if (l < k && l < m) | |
1058 return 1 + l; | |
1059 else | |
1060 return 1 + m; | |
1061 } | |
1062 | |
1063 ssize_t | |
1064 clustering(char *host, char *port, char *selector, struct gopherentry *g… | |
1065 { | |
1066 size_t i, j; | |
1067 ssize_t r; | |
1068 double *distances, d; | |
1069 double *mdists; | |
1070 struct { | |
1071 double selector; | |
1072 double segment; | |
1073 double index; | |
1074 } *distanceparts, maxdistanceparts; | |
1075 uint32_t prng; | |
1076 struct point *points; | |
1077 | |
1078 distanceparts = xmalloc(triangularnumber(l) * sizeof(*distancepa… | |
1079 | |
1080 /* So the sammon mapping can be happy. */ | |
1081 prng = fnv1a(4, host, port, selector, "distancewiggle"); | |
1082 memset(&maxdistanceparts, 0, sizeof(maxdistanceparts)); | |
1083 for (i = 0; i < l; i++) { | |
1084 memset(&distanceparts[distanceindex(i, i)], 0, sizeof(*d… | |
1085 for (j = 0; j < i; j++) { | |
1086 distanceparts[distanceindex(i, j)].selector = le… | |
1087 distanceparts[distanceindex(i, j)].segment = dif… | |
1088 distanceparts[distanceindex(i, j)].index = diff(… | |
1089 | |
1090 if (distanceparts[distanceindex(i, j)].selector … | |
1091 maxdistanceparts.selector = distancepart… | |
1092 if (distanceparts[distanceindex(i, j)].segment >… | |
1093 maxdistanceparts.segment = distanceparts… | |
1094 if (distanceparts[distanceindex(i, j)].index > m… | |
1095 maxdistanceparts.index = distanceparts[d… | |
1096 } | |
1097 } | |
1098 for (i = 0; i < l; i++) { | |
1099 for (j = 0; j < i; j++) { | |
1100 distanceparts[distanceindex(i, j)].selector /= m… | |
1101 distanceparts[distanceindex(i, j)].segment /= ma… | |
1102 distanceparts[distanceindex(i, j)].index /= maxd… | |
1103 } | |
1104 } | |
1105 | |
1106 distances = xmalloc(triangularnumber(l) * sizeof(*distances)); | |
1107 for (i = 0; i < l; i++) { | |
1108 distances[distanceindex(i, i)] = 0; | |
1109 for (j = 0; j < i; j++) | |
1110 distances[distanceindex(i, j)] = 0.75 * distance… | |
1111 } | |
1112 free(distanceparts); | |
1113 | |
1114 r = kmedoids(g, l, k, distances, assocs, fnv1a(4, host, port, se… | |
1115 | |
1116 points = calloc(l, sizeof(*points)); | |
1117 sammon(distances, l, fnv1a(4, host, port, selector, "sammonseed"… | |
1118 | |
1119 for (i = 0; i < l; i++) | |
1120 printf("%lf\t%lf\t%ld\t%lu\t%s\n", points[i].dims[0], po… | |
1121 | |
1122 free(points); | |
1123 free(distances); | |
1124 | |
1125 return r; | |
1126 } | |
1127 | |
1128 char gopherbuffer[16 * 1024 * 1024]; | |
1129 | |
1130 int | |
1131 main(int argc, char *argv[]) | |
1132 { | |
1133 ssize_t l; | |
1134 size_t k; | |
1135 struct gopherentry *g; | |
1136 char *host, *port, *selector; | |
1137 size_t *cassocs; | |
1138 | |
1139 if (argc < 2) { | |
1140 printf("%s host [port [selector]]\n", argv[0]); | |
1141 return 1; | |
1142 } | |
1143 | |
1144 host = argv[1]; | |
1145 port = (argc > 2) ? argv[2] : "70"; | |
1146 selector = (argc > 3) ? argv[3] : ""; | |
1147 | |
1148 l = gopher_request(host, port, selector, gopherbuffer, sizeof(go… | |
1149 if (l == -1) | |
1150 return 1; | |
1151 | |
1152 l = gopher_parsemenu(gopherbuffer, &g); | |
1153 if (l == -1) | |
1154 return 1; | |
1155 | |
1156 l = gopher_removeentries(g, l); | |
1157 | |
1158 cassocs = calloc(l, sizeof(*cassocs)); | |
1159 if (!cassocs) | |
1160 return 1; | |
1161 k = 5; | |
1162 clustering(host, port, selector, g, l, cassocs, k); | |
1163 | |
1164 return 0; | |
1165 } |