#define HY_BIT 0200 /* stuff in here only works for 7-bit ascii */
/* this value is used (as a literal) in suftab.c */
/* to encode possible hyphenation points in suffixes. */
/* it could be changed, by widening the tables */
/* to be shorts instead of chars. */
/*
* troff8.c
*
* hyphenation
*/
int hexsize = 0; /* hyphenation exception list size */
char *hbufp = NULL; /* base of list */
char *nexth = NULL; /* first free slot in list */
Tchar *hyend;
/*
 * Body of the hyphenation driver (the function header and the local
 * declarations of i and j are not visible in this part of the file).
 * It isolates one alphabetic word starting at wp, runs the hyphenation
 * methods over it, and leaves a zero-terminated, sorted list of
 * hyphenation points starting at hyptr.
 */
i = wp;
/* skip leading punctuation; the loop overshoots by one, hence the --i below */
while (punct((*i++)))
;
if (!alpha(*--i))
return;
wdstart = i++;
/* scan the run of alphabetic characters that makes up the word */
while (alpha(*i++))
;
/* i is one past the first non-alphabetic char; step back to the last letter */
hyend = wdend = --i - 1;
/* skip trailing punctuation */
while (punct((*i++)))
;
/* anything other than the terminating 0 after the punctuation: give up */
if (*--i)
return;
if (wdend - wdstart < 4) /* 4 chars is too short to hyphenate */
return;
/* start with an empty list of hyphenation points */
hyp = hyptr;
*hyp = 0;
hyoff = 2;
/* for now, try exceptions first, then tex (if hyphalg is non-zero),
then suffix and digram if tex didn't hyphenate it at all.
*/
/* each method returns non-zero to stop the chain; digram is the last resort */
if (!exword() && !texhyphen() && !suffix())
digram();
/* terminate the list, then bubble-sort the hyphenation points into increasing order */
*hyp++ = 0;
if (*hyptr)
/* j counts the swaps made in a pass; repeat until a pass makes none */
for (j = 1; j; ) {
j = 0;
for (hyp = hyptr + 1; *hyp != 0; hyp++) {
if (*(hyp - 1) > *hyp) {
j++;
/* i is reused here as the temporary for the swap */
i = *hyp;
*hyp = *(hyp - 1);
*(hyp - 1) = i;
}
}
}
}
/*
 * alpha: return non-zero if i is really alphabetic, 0 otherwise.
 *
 * A Tchar for which ismot() is true is never alphabetic.  Character
 * codes of ALPHABET or above are rejected before isalpha() is called:
 * this isn't very elegant, but there's no other good way to make sure
 * the code is in range for the call of isalpha.
 *
 * The return type is spelled out; the original relied on implicit int,
 * which is no longer valid C.
 */
static int alpha(Tchar i)
{
	if (ismot(i))
		return 0;
	if (cbits(i) >= ALPHABET)	/* out of range for isalpha */
		return 0;
	return isalpha(cbits(i));
}
/*
 * Body of the suffix stripper (the function header and the declarations
 * of i, s, s0 and w are not visible in this part of the file).
 * Each pass looks at the word ending at hyend, finds a matching entry in
 * suftab, records the hyphenation points encoded in that entry, moves
 * hyend back in front of the suffix and tries again.
 * Returns 1 when exword() accepts what is left of the word, 0 otherwise.
 */
again:
/* the last letter of the (remaining) word selects the suffix list */
i = cbits(*hyend);
if (!alpha(i))
return(0);
/* fold an upper-case letter to lower case ('A' - 'a' is negative) */
if (i < 'a')
i -= 'A' - 'a';
if ((s0 = suftab[i-'a']) == 0)
return(0);
/* walk the entries of the list until one matches the end of the word */
for (;;) {
/* the low 4 bits of an entry's first byte give the entry length; 0 ends the list */
if ((i = *s0 & 017) == 0)
return(0);
/*
 * compare the entry, last byte first, against the characters before
 * hyend (the letter at hyend already selected this list); the word
 * characters are folded to lower case and the entry bytes are masked
 * to 7 bits so the HY_BIT flag does not take part in the comparison
 */
s = s0 + i - 1;
w = hyend - 1;
while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) {
s--;
w--;
}
/* s reached the entry's first byte: every suffix character matched */
if (s == s0)
break;
s0 += i;
}
/* second pass over the matched entry: pick up the hyphenation marks */
s = s0 + i - 1;
w = hyend;
/* HY_BIT in the first byte marks a point at hyend itself; jump into the loop body */
if (*s0 & HY_BIT)
goto mark;
while (s > s0) {
w--;
if (*s-- & HY_BIT) {
mark:
/* the word now ends just before this hyphenation point */
hyend = w - 1;
if (*s0 & 0100) /* 0100 used in suftab to encode something too */
continue;
/* give up unless chkvow finds a vowel before the point */
if (!chkvow(w))
return(0);
*hyp++ = w;
}
}
/* NOTE(review): 040 is another suftab flag; its meaning is not visible here */
if (*s0 & 040)
return(0);
if (exword())
return(1);
/* try to strip another suffix from the shortened word */
goto again;
}
/*
 * maplow: return the lower-case equivalent of i if i is an upper-case
 * letter, otherwise return i unchanged.
 *
 * The <ctype.h> functions are only defined for EOF and for values that
 * fit in an unsigned char, so any other value (negative codes, or
 * character codes beyond the byte range) is passed through untouched
 * instead of being handed to isupper().
 *
 * The return type is spelled out; the original relied on implicit int.
 */
int maplow(int i)
{
	/* i == (unsigned char)i holds exactly when i is a valid ctype argument */
	if (i == (unsigned char)i && isupper(i))
		i = tolower(i);
	return(i);
}
/*
 * vowel: return 1 if i is one of the letters a, e, i, o, u or y in
 * either case, 0 for anything else.
 *
 * The return type is spelled out; the original relied on implicit int.
 */
int vowel(int i)
{
	switch (i) {
	case 'a': case 'A':
	case 'e': case 'E':
	case 'i': case 'I':
	case 'o': case 'O':
	case 'u': case 'U':
	case 'y': case 'Y':
		return(1);
	default:
		return(0);
	}
}
/*
 * chkvow: scan backwards from the character just before w down to
 * wdstart and return a pointer to the first vowel met, or a null
 * pointer if there is none in that range.
 */
Tchar *chkvow(Tchar *w)
{
	while (w > wdstart) {
		w--;
		if (vowel(cbits(*w)))
			return w;
	}
	return NULL;
}
void digram(void)
{
Tchar *w;
int val;
Tchar *nhyend, *maxw;
int maxval;
extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];
again:
if (!(w = chkvow(hyend + 1)))
return;
hyend = w;
if (!(w = chkvow(hyend)))
return;
nhyend = w;
maxval = 0;
w--;
while (++w < hyend && w < wdend - 1) {
val = 1;
if (w == wdstart)
val *= dilook('a', cbits(*w), bxh);
else if (w == wdstart + 1)
val *= dilook(cbits(*(w-1)), cbits(*w), bxxh);
else
val *= dilook(cbits(*(w-1)), cbits(*w), xxh);
val *= dilook(cbits(*w), cbits(*(w+1)), xhx);
val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx);
if (val > maxval) {
maxval = val;
maxw = w + 1;
}
}
hyend = nhyend;
if (maxval > thresh)
*hyp++ = maxw;
goto again;
}
/*
 * dilook: look up the weight of the letter pair (a, b) in table t.
 *
 * Both letters are folded to lower case.  The row is selected by a;
 * each byte of a row packs two 4-bit weights, the one for an
 * even-numbered b ('a', 'c', ...) in the high nibble and the one for
 * an odd-numbered b in the low nibble.  Returns the weight, 0..15.
 *
 * A pair that contains anything other than a letter a-z has no table
 * entry; 0 is returned for it rather than indexing outside the table.
 * The byte is read as unsigned so that the shift never operates on a
 * negative value.  The return type is spelled out; the original relied
 * on implicit int.
 */
int dilook(int a, int b, char t[26][13])
{
	int row, col, packed;

	row = maplow(a) - 'a';
	col = maplow(b) - 'a';
	if (row < 0 || row >= 26 || col < 0 || col >= 26)
		return(0);
	packed = (unsigned char)t[row][col / 2];
	if (!(col & 01))
		packed >>= 4;
	return(packed & 017);
}
/* here beginneth the tex hyphenation code, as interpreted freely */
/* the main difference is that there is no attempt to squeeze space */
/* as tightly as tex does. */
static int texit(Tchar *, Tchar *);
static int readpats(void);
static void install(char *);
static void fixup(void);
static int trieindex(int, int);
/*
 * texhyphen: hyphenate the current word (wdstart..wdend) with the tex
 * patterns.
 *
 * Does nothing and returns 0 when hyphalg is zero or when the patterns
 * could not be loaded.  The patterns are read by readpats() on the
 * first call that needs them; if that fails, the failure is remembered
 * and this call returns 0 as well, so texit() is never run without
 * loaded patterns.  Otherwise returns whatever texit() returns
 * (non-zero if a hyphen was found).
 */
int texhyphen(void)
{
	static int loaded = 0;	/* 0: not tried yet, 1: patterns loaded, -1: couldn't find tex file */

	if (hyphalg == 0 || loaded == -1)	/* non-zero => tex for now */
		return 0;
	if (loaded == 0) {
		if (!readpats()) {
			loaded = -1;
			return 0;
		}
		loaded = 1;
	}
	return texit(wdstart, wdend);
}
/*
 * texit: hyphenate the word start..end (inclusive) as in tex and
 * return the number of entries now in the hyp list (non-zero if a
 * hyphen was found).
 *
 * The word is copied in lower case into w[], with a '.' before and
 * after it.  For every position in w[], the patterns that begin with
 * the two characters found there are tried; each pattern that matches
 * contributes its digit string, and cnt[] keeps the largest digit seen
 * for every position.  A position whose final digit is odd is recorded
 * as a hyphenation point, provided it satisfies 2 < i < nw - 3.
 *
 * cnt[] is indexed up to and including nw, and nw can reach MAXTEXWORD
 * for a very long word, so cnt[] needs MAXTEXWORD+1 elements; the
 * original array was one element short.
 */
static int texit(Tchar *start, Tchar *end)	/* hyphenate as in tex, return # found */
{
	enum { MAXTEXWORD = 500 };	/* room for the word plus the two '.' markers */
	int nw, i, k, equal, cnt[MAXTEXWORD+1];
	char w[MAXTEXWORD+1], *np, *pp, *wp, *xpp, *xwp;

	w[0] = '.';
	for (nw = 1; start <= end && nw < MAXTEXWORD-1; nw++, start++)
		w[nw] = maplow(tolower(cbits(*start)));
	start -= (nw - 1);	/* back to the first character of the word */
	w[nw++] = '.';
	w[nw] = 0;
/*
 *	printf("try %s\n", w);
 */
	for (i = 0; i <= nw; i++)
		cnt[i] = '0';

	for (wp = w; wp+1 < w+nw; wp++) {
		for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) {
			if (pp == 0		/* no trie entry */
			 || *pp != *wp		/* no match on 1st letter */
			 || *(pp+1) != *(wp+1))	/* no match on 2nd letter */
				break;		/* so move to next letter of word */
			equal = 1;
			for (xpp = pp+2, xwp = wp+2; *xpp; )
				if (*xpp++ != *xwp++) {
					equal = 0;
					break;
				}
			if (equal) {
				/* the digit string follows the 0 that ends the pattern */
				np = xpp+1;	/* numpat */
				for (k = wp-w; *np; k++, np++)
					if (*np > cnt[k])
						cnt[k] = *np;
/*
 *				printf("match: %s  %s\n", pp, xpp+1);
 */
			}
			pp += *(pp-1);	/* skip over pattern and numbers to next */
		}
	}
/*
 *	for (i = 0; i < nw; i++) printf("%c", w[i]);
 *	printf("  ");
 *	for (i = 0; i <= nw; i++) printf("%c", cnt[i]);
 *	printf("\n");
 */
/*
 *	for (i = 1; i < nw - 1; i++) {
 *		if (i > 2 && i < nw - 3 && cnt[i] % 2)
 *			printf("-");
 *		if (cbits(start[i-1]) != '.')
 *			printf("%c", cbits(start[i-1]));
 *	}
 *	printf("\n");
 */
	/* w[i] is start[i-1]; an odd digit marks a permitted break */
	for (i = 1; i < nw -1; i++)
		if (i > 2 && i < nw - 3 && cnt[i] % 2)
			*hyp++ = start + i - 1;
	return hyp - hyptr;	/* non-zero if a hyphen was found */
}
/*
This code assumes that hyphen.tex looks like
% some comments
\patterns{ % more comments
pat5ter4ns, 1 per line, SORTED, nothing else
}
more goo
\hyphenation{ % more comments
ex-cep-tions, one per line; i ignore this part for now
}
this code is NOT robust against variations. unfortunately,
it looks like every local language version of this file has
a different format. i have also made no provision for weird
characters. sigh.
*/
static int readpats(void)
{
FILE *fp;
char buf[200], buf1[200];