| n8.c - 9base - revived minimalist port of Plan 9 userland to Unix | |
| git clone git://git.suckless.org/9base | |
| Log | |
| Files | |
| Refs | |
| README | |
| LICENSE | |
| --- | |
| n8.c (10320B) | |
| --- | |
| 1 #include <u.h> | |
| 2 #include "tdef.h" | |
| 3 #include "fns.h" | |
| 4 #include "ext.h" | |
| 5 | |
| 6 #define HY_BIT 0200 /* stuff in here only works for… | |
| 7 /* this value is used (as a literal) in suftab.c… | |
| 8 /* to encode possible hyphenation points in suff… | |
| 9 /* it could be changed, by widening the tables */ | |
| 10 /* to be shorts instead of chars. */ | |
| 11 | |
| 12 /* | |
| 13 * troff8.c | |
| 14 * | |
| 15 * hyphenation | |
| 16 */ | |
| 17 | |
| 18 int hexsize = 0; /* hyphenation exception list siz… | |
| 19 char *hbufp = NULL; /* base of list */ | |
| 20 char *nexth = NULL; /* first free slot in list */ | |
| 21 Tchar *hyend; | |
| 22 | |
| 23 #define THRESH 160 /* digram goodness threshold */ | |
| 24 int thresh = THRESH; | |
| 25 | |
| 26 int texhyphen(void); | |
| 27 static int alpha(Tchar); | |
| 28 | |
| 29 void hyphen(Tchar *wp) | |
| 30 { | |
| 31 int j; | |
| 32 Tchar *i; | |
| 33 | |
| 34 i = wp; | |
| 35 while (punct((*i++))) | |
| 36 ; | |
| 37 if (!alpha(*--i)) | |
| 38 return; | |
| 39 wdstart = i++; | |
| 40 while (alpha(*i++)) | |
| 41 ; | |
| 42 hyend = wdend = --i - 1; | |
| 43 while (punct((*i++))) | |
| 44 ; | |
| 45 if (*--i) | |
| 46 return; | |
| 47 if (wdend - wdstart < 4) /* 4 chars is too short to hyphe… | |
| 48 return; | |
| 49 hyp = hyptr; | |
| 50 *hyp = 0; | |
| 51 hyoff = 2; | |
| 52 | |
| 53 /* for now, try exceptions first, then tex (if hyphalg is non-ze… | |
| 54 then suffix and digram if tex didn't hyphenate it at all. | |
| 55 */ | |
| 56 | |
| 57 if (!exword() && !texhyphen() && !suffix()) | |
| 58 digram(); | |
| 59 | |
| 60 /* this appears to sort hyphenation points into increasing order… | |
| 61 *hyp++ = 0; | |
| 62 if (*hyptr) | |
| 63 for (j = 1; j; ) { | |
| 64 j = 0; | |
| 65 for (hyp = hyptr + 1; *hyp != 0; hyp++) { | |
| 66 if (*(hyp - 1) > *hyp) { | |
| 67 j++; | |
| 68 i = *hyp; | |
| 69 *hyp = *(hyp - 1); | |
| 70 *(hyp - 1) = i; | |
| 71 } | |
| 72 } | |
| 73 } | |
| 74 } | |
| 75 | |
| 76 static int alpha(Tchar i) /* non-zero if really alphabetic */ | |
| 77 { | |
| 78 if (ismot(i)) | |
| 79 return 0; | |
| 80 else if (cbits(i) >= ALPHABET) /* this isn't very elegant… | |
| 81 return 0; /* no good way to make sure i i… | |
| 82 else /* the call of isalpha */ | |
| 83 return isalpha(cbits(i)); | |
| 84 } | |
| 85 | |
| 86 int | |
| 87 punct(Tchar i) | |
| 88 { | |
| 89 if (!i || alpha(i)) | |
| 90 return(0); | |
| 91 else | |
| 92 return(1); | |
| 93 } | |
| 94 | |
| 95 | |
| 96 void caseha(void) /* set hyphenation algorithm */ | |
| 97 { | |
| 98 hyphalg = HYPHALG; | |
| 99 if (skip()) | |
| 100 return; | |
| 101 noscale++; | |
| 102 hyphalg = atoi0(); | |
| 103 noscale = 0; | |
| 104 } | |
| 105 | |
| 106 | |
| 107 void caseht(void) /* set hyphenation threshold; not in manual! */ | |
| 108 { | |
| 109 thresh = THRESH; | |
| 110 if (skip()) | |
| 111 return; | |
| 112 noscale++; | |
| 113 thresh = atoi0(); | |
| 114 noscale = 0; | |
| 115 } | |
| 116 | |
| 117 | |
| 118 char *growh(char *where) | |
| 119 { | |
| 120 char *new; | |
| 121 | |
| 122 hexsize += NHEX; | |
| 123 if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL) | |
| 124 return NULL; | |
| 125 if (new == hbufp) { | |
| 126 return where; | |
| 127 } else { | |
| 128 int diff; | |
| 129 diff = where - hbufp; | |
| 130 hbufp = new; | |
| 131 return new + diff; | |
| 132 } | |
| 133 } | |
| 134 | |
| 135 | |
| 136 void casehw(void) | |
| 137 { | |
| 138 int i, k; | |
| 139 char *j; | |
| 140 Tchar t; | |
| 141 | |
| 142 if (nexth == NULL) { | |
| 143 if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) ==… | |
| 144 ERROR "No space for exception word list." WARN; | |
| 145 return; | |
| 146 } | |
| 147 hexsize = NHEX; | |
| 148 } | |
| 149 k = 0; | |
| 150 while (!skip()) { | |
| 151 if ((j = nexth) >= hbufp + hexsize - 2) | |
| 152 if ((j = nexth = growh(j)) == NULL) | |
| 153 goto full; | |
| 154 for (;;) { | |
| 155 if (ismot(t = getch())) | |
| 156 continue; | |
| 157 i = cbits(t); | |
| 158 if (i == ' ' || i == '\n') { | |
| 159 *j++ = 0; | |
| 160 nexth = j; | |
| 161 *j = 0; | |
| 162 if (i == ' ') | |
| 163 break; | |
| 164 else | |
| 165 return; | |
| 166 } | |
| 167 if (i == '-') { | |
| 168 k = HY_BIT; | |
| 169 continue; | |
| 170 } | |
| 171 *j++ = maplow(i) | k; | |
| 172 k = 0; | |
| 173 if (j >= hbufp + hexsize - 2) | |
| 174 if ((j = growh(j)) == NULL) | |
| 175 goto full; | |
| 176 } | |
| 177 } | |
| 178 return; | |
| 179 full: | |
| 180 ERROR "Cannot grow exception word list." WARN; | |
| 181 *nexth = 0; | |
| 182 } | |
| 183 | |
| 184 | |
| 185 int exword(void) | |
| 186 { | |
| 187 Tchar *w; | |
| 188 char *e, *save; | |
| 189 | |
| 190 e = hbufp; | |
| 191 while (1) { | |
| 192 save = e; | |
| 193 if (e == NULL || *e == 0) | |
| 194 return(0); | |
| 195 w = wdstart; | |
| 196 while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*… | |
| 197 e++; | |
| 198 w++; | |
| 199 } | |
| 200 if (!*e) { | |
| 201 if (w-1 == hyend || (w == wdend && maplow(cbits(… | |
| 202 w = wdstart; | |
| 203 for (e = save; *e; e++) { | |
| 204 if (*e & HY_BIT) | |
| 205 *hyp++ = w; | |
| 206 if (hyp > hyptr + NHYP - 1) | |
| 207 hyp = hyptr + NHYP - 1; | |
| 208 w++; | |
| 209 } | |
| 210 return(1); | |
| 211 } else { | |
| 212 e++; | |
| 213 continue; | |
| 214 } | |
| 215 } else | |
| 216 while (*e++) | |
| 217 ; | |
| 218 } | |
| 219 } | |
| 220 | |
| 221 int | |
| 222 suffix(void) | |
| 223 { | |
| 224 Tchar *w; | |
| 225 char *s, *s0; | |
| 226 Tchar i; | |
| 227 extern char *suftab[]; | |
| 228 | |
| 229 again: | |
| 230 i = cbits(*hyend); | |
| 231 if (!alpha(i)) | |
| 232 return(0); | |
| 233 if (i < 'a') | |
| 234 i -= 'A' - 'a'; | |
| 235 if ((s0 = suftab[i-'a']) == 0) | |
| 236 return(0); | |
| 237 for (;;) { | |
| 238 if ((i = *s0 & 017) == 0) | |
| 239 return(0); | |
| 240 s = s0 + i - 1; | |
| 241 w = hyend - 1; | |
| 242 while (s > s0 && w >= wdstart && (*s & 0177) == maplow(c… | |
| 243 s--; | |
| 244 w--; | |
| 245 } | |
| 246 if (s == s0) | |
| 247 break; | |
| 248 s0 += i; | |
| 249 } | |
| 250 s = s0 + i - 1; | |
| 251 w = hyend; | |
| 252 if (*s0 & HY_BIT) | |
| 253 goto mark; | |
| 254 while (s > s0) { | |
| 255 w--; | |
| 256 if (*s-- & HY_BIT) { | |
| 257 mark: | |
| 258 hyend = w - 1; | |
| 259 if (*s0 & 0100) /* 0100 used in suftab to… | |
| 260 continue; | |
| 261 if (!chkvow(w)) | |
| 262 return(0); | |
| 263 *hyp++ = w; | |
| 264 } | |
| 265 } | |
| 266 if (*s0 & 040) | |
| 267 return(0); | |
| 268 if (exword()) | |
| 269 return(1); | |
| 270 goto again; | |
| 271 } | |
| 272 | |
| 273 int | |
| 274 maplow(int i) | |
| 275 { | |
| 276 if (isupper(i)) | |
| 277 i = tolower(i); | |
| 278 return(i); | |
| 279 } | |
| 280 | |
| 281 int | |
| 282 vowel(int i) | |
| 283 { | |
| 284 switch (i) { | |
| 285 case 'a': case 'A': | |
| 286 case 'e': case 'E': | |
| 287 case 'i': case 'I': | |
| 288 case 'o': case 'O': | |
| 289 case 'u': case 'U': | |
| 290 case 'y': case 'Y': | |
| 291 return(1); | |
| 292 default: | |
| 293 return(0); | |
| 294 } | |
| 295 } | |
| 296 | |
| 297 | |
| 298 Tchar *chkvow(Tchar *w) | |
| 299 { | |
| 300 while (--w >= wdstart) | |
| 301 if (vowel(cbits(*w))) | |
| 302 return(w); | |
| 303 return(0); | |
| 304 } | |
| 305 | |
| 306 | |
| 307 void digram(void) | |
| 308 { | |
| 309 Tchar *w; | |
| 310 int val; | |
| 311 Tchar *nhyend, *maxw; | |
| 312 int maxval; | |
| 313 extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13],… | |
| 314 maxw = 0; | |
| 315 again: | |
| 316 if (!(w = chkvow(hyend + 1))) | |
| 317 return; | |
| 318 hyend = w; | |
| 319 if (!(w = chkvow(hyend))) | |
| 320 return; | |
| 321 nhyend = w; | |
| 322 maxval = 0; | |
| 323 w--; | |
| 324 while (++w < hyend && w < wdend - 1) { | |
| 325 val = 1; | |
| 326 if (w == wdstart) | |
| 327 val *= dilook('a', cbits(*w), bxh); | |
| 328 else if (w == wdstart + 1) | |
| 329 val *= dilook(cbits(*(w-1)), cbits(*w), bxxh); | |
| 330 else | |
| 331 val *= dilook(cbits(*(w-1)), cbits(*w), xxh); | |
| 332 val *= dilook(cbits(*w), cbits(*(w+1)), xhx); | |
| 333 val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx); | |
| 334 if (val > maxval) { | |
| 335 maxval = val; | |
| 336 maxw = w + 1; | |
| 337 } | |
| 338 } | |
| 339 hyend = nhyend; | |
| 340 if (maxval > thresh) | |
| 341 *hyp++ = maxw; | |
| 342 goto again; | |
| 343 } | |
| 344 | |
| 345 int | |
| 346 dilook(int a, int b, char t[26][13]) | |
| 347 { | |
| 348 int i, j; | |
| 349 | |
| 350 i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2]; | |
| 351 if (!(j & 01)) | |
| 352 i >>= 4; | |
| 353 return(i & 017); | |
| 354 } | |
| 355 | |
| 356 | |
| 357 /* here beginneth the tex hyphenation code, as interpreted freely */ | |
| 358 /* the main difference is that there is no attempt to squeeze space */ | |
| 359 /* as tightly at tex does. */ | |
| 360 | |
| 361 static int texit(Tchar *, Tchar *); | |
| 362 static int readpats(void); | |
| 363 static void install(char *); | |
| 364 static void fixup(void); | |
| 365 static int trieindex(int, int); | |
| 366 | |
| 367 static char pats[50000]; /* size ought to be computed dyna… | |
| 368 static char *nextpat = pats; | |
| 369 static char *trie[27*27]; /* english-specific sizes */ | |
| 370 | |
| 371 int texhyphen(void) | |
| 372 { | |
| 373 static int loaded = 0; /* -1: couldn't find tex f… | |
| 374 | |
| 375 if (hyphalg == 0 || loaded == -1) /* non-zero => tex for … | |
| 376 return 0; | |
| 377 if (loaded == 0) { | |
| 378 if (readpats()) | |
| 379 loaded = 1; | |
| 380 else | |
| 381 loaded = -1; | |
| 382 } | |
| 383 return texit(wdstart, wdend); | |
| 384 } | |
| 385 | |
| 386 static int texit(Tchar *start, Tchar *end) /* hyphenate as in tex… | |
| 387 { | |
| 388 int nw, i, k, equal, cnt[500]; | |
| 389 char w[500+1], *np, *pp, *wp, *xpp, *xwp; | |
| 390 | |
| 391 w[0] = '.'; | |
| 392 for (nw = 1; start <= end && nw < 500-1; nw++, start++) | |
| 393 w[nw] = maplow(tolower(cbits(*start))); | |
| 394 start -= (nw - 1); | |
| 395 w[nw++] = '.'; | |
| 396 w[nw] = 0; | |
| 397 /* | |
| 398 * printf("try %s\n", w); | |
| 399 */ | |
| 400 for (i = 0; i <= nw; i++) | |
| 401 cnt[i] = '0'; | |
| 402 | |
| 403 for (wp = w; wp+1 < w+nw; wp++) { | |
| 404 for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; )… | |
| 405 if (pp == 0 /* no trie entry */ | |
| 406 || *pp != *wp /* no match on 1st… | |
| 407 || *(pp+1) != *(wp+1)) /* no match on 2n… | |
| 408 break; /* so move to ne… | |
| 409 equal = 1; | |
| 410 for (xpp = pp+2, xwp = wp+2; *xpp; ) | |
| 411 if (*xpp++ != *xwp++) { | |
| 412 equal = 0; | |
| 413 break; | |
| 414 } | |
| 415 if (equal) { | |
| 416 np = xpp+1; /* numpat */ | |
| 417 for (k = wp-w; *np; k++, np++) | |
| 418 if (*np > cnt[k]) | |
| 419 cnt[k] = *np; | |
| 420 /* | |
| 421 * printf("match: %s %s\n", pp, xpp+1); | |
| 422 */ | |
| 423 } | |
| 424 pp += *(pp-1); /* skip over pattern and n… | |
| 425 } | |
| 426 } | |
| 427 /* | |
| 428 * for (i = 0; i < nw; i++) printf("%c", w[i]); | |
| 429 * printf(" "); | |
| 430 * for (i = 0; i <= nw; i++) printf("%c", cnt[i]); | |
| 431 * printf("\n"); | |
| 432 */ | |
| 433 /* | |
| 434 * for (i = 1; i < nw - 1; i++) { | |
| 435 * if (i > 2 && i < nw - 3 && cnt[i] % 2) | |
| 436 * printf("-"); | |
| 437 * if (cbits(start[i-1]) != '.') | |
| 438 * printf("%c", cbits(start[i-1])); | |
| 439 * } | |
| 440 * printf("\n"); | |
| 441 */ | |
| 442 for (i = 1; i < nw -1; i++) | |
| 443 if (i > 2 && i < nw - 3 && cnt[i] % 2) | |
| 444 *hyp++ = start + i - 1; | |
| 445 return hyp - hyptr; /* non-zero if a hyphen was found */ | |
| 446 } | |
| 447 | |
| 448 /* | |
| 449 This code assumes that hyphen.tex looks like | |
| 450 % some comments | |
| 451 \patterns{ % more comments | |
| 452 pat5ter4ns, 1 per line, SORTED, nothing else | |
| 453 } | |
| 454 more goo | |
| 455 \hyphenation{ % more comments | |
| 456 ex-cep-tions, one per line; i ignore this part for now | |
| 457 } | |
| 458 | |
| 459 this code is NOT robust against variations. unfortunately, | |
| 460 it looks like every local language version of this file has | |
| 461 a different format. i have also made no provision for weird | |
| 462 characters. sigh. | |
| 463 */ | |
| 464 | |
| 465 static int readpats(void) | |
| 466 { | |
| 467 FILE *fp; | |
| 468 char buf[200], buf1[200]; | |
| 469 | |
| 470 if ((fp = fopen(unsharp(TEXHYPHENS), "r")) == NULL | |
| 471 && (fp = fopen(unsharp(DWBalthyphens), "r")) == NULL) { | |
| 472 ERROR "warning: can't find hyphen.tex" WARN; | |
| 473 return 0; | |
| 474 } | |
| 475 | |
| 476 while (fgets(buf, sizeof buf, fp) != NULL) { | |
| 477 sscanf(buf, "%s", buf1); | |
| 478 if (strcmp(buf1, "\\patterns{") == 0) | |
| 479 break; | |
| 480 } | |
| 481 while (fgets(buf, sizeof buf, fp) != NULL) { | |
| 482 if (buf[0] == '}') | |
| 483 break; | |
| 484 install(buf); | |
| 485 } | |
| 486 fclose(fp); | |
| 487 fixup(); | |
| 488 return 1; | |
| 489 } | |
| 490 | |
| 491 static void install(char *s) /* map ab4c5de to: 12 abcde \0 00405… | |
| 492 { | |
| 493 int npat, lastpat; | |
| 494 char num[500], *onextpat = nextpat; | |
| 495 | |
| 496 num[0] = '0'; | |
| 497 *nextpat++ = ' '; /* fill in with count later */ | |
| 498 for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) { | |
| 499 if (isdigit((uchar)*s)) { | |
| 500 num[npat] = *s; | |
| 501 lastpat = npat; | |
| 502 } else { | |
| 503 *nextpat++ = *s; | |
| 504 npat++; | |
| 505 num[npat] = '0'; | |
| 506 } | |
| 507 } | |
| 508 *nextpat++ = 0; | |
| 509 if (nextpat > pats + sizeof(pats)-20) { | |
| 510 ERROR "tex hyphenation table overflow, tail end ignored"… | |
| 511 nextpat = onextpat; | |
| 512 } | |
| 513 num[lastpat+1] = 0; | |
| 514 strcat(nextpat, num); | |
| 515 nextpat += strlen(nextpat) + 1; | |
| 516 } | |
| 517 | |
| 518 static void fixup(void) /* build indexes of where . a b c ... sta… | |
| 519 { | |
| 520 char *p, *lastc; | |
| 521 int n; | |
| 522 | |
| 523 for (lastc = pats, p = pats+1; p < nextpat; p++) | |
| 524 if (*p == ' ') { | |
| 525 *lastc = p - lastc; | |
| 526 lastc = p; | |
| 527 } | |
| 528 *lastc = p - lastc; | |
| 529 for (p = pats+1; p < nextpat; ) { | |
| 530 n = trieindex(p[0], p[1]); | |
| 531 if (trie[n] == 0) | |
| 532 trie[n] = p; | |
| 533 p += p[-1]; | |
| 534 } | |
| 535 /* printf("pats = %d\n", nextpat - pats); */ | |
| 536 } | |
| 537 | |
| 538 static int trieindex(int d1, int d2) | |
| 539 { | |
| 540 int z; | |
| 541 | |
| 542 z = 27 * (d1 == '.' ? 0 : d1 - 'a' + 1) + (d2 == '.' ? 0 : d2 - … | |
| 543 assert(z >= 0 && z < 27*27); | |
| 544 return z; | |
| 545 } |