From:   RUSVX2::Jnet%"A4422DAB@AWIUNI11"      "Erich Neuwirth" 19-NOV-1990 01:18:52.20
To:     Eberhard Mattes <DAAR1034@DS0RUS54>, 'Stefan Momma' <NBAB1424@DS0RUS54>, Harald Krieger <VCU11671@DS0RUS54>, 'Barbara Burr' <ZRFN0370@DS0RUS54>
CC:
Subj:

Received: From DEARN(MAILER) by DS0RUS54 with Jnet id 1075
         for ZRFN0370@DS0RUS54; Mon, 19 Nov 90 01:17 A
Received: by DEARN (Mailer R2.07) id 0854; Mon, 19 Nov 90 01:16:15 CET
Date:         Mon, 19 Nov 90 01:05:22 MEZ
Reply-To:     German TeX Users Communication List <TEX-D-L@DEARN>
Sender:       German TeX Users Communication List <TEX-D-L@DEARN>
From:         Erich Neuwirth <A4422DAB@AWIUNI11>
To:           Eberhard Mattes <DAAR1034@DS0RUS54>,
             'Stefan Momma' <NBAB1424@DS0RUS54>,
             Harald Krieger <VCU11671@DS0RUS54>,
             'Barbara Burr' <ZRFN0370@DS0RUS54>


======================================================================== 491
Date:         Sun, 18 Nov 90 23:27:16 WUT
From:         Gustaf Neumann <NEUMANN@AWIWUW11>
Subject:      Programm zur Konvertierung von Umlauten
To:           TEX-D-L@DEARN


Nachstehend folgt ein Lex-Programm zur Konvertierung von Texten mit
'Ascii-German'-Umlauten (Umlaute werden als Ae, Oe, ... geschrieben) in
Texte mit (TeX)-Umlauten. Das Programm ist bei weitem noch nicht
perfekt (und wird es auch nie sein), doch ist es vielleicht fuer einige
nuetzlich. Ich konnte mit dem Programm den Text des Buches

\bibitem[{Neu88}]{neumann88}
       G.~Neumann: \T{Metaprogrammierung und Prolog},
       Addison--Wesley, Bonn 1988.

zur Gaenze fehlerfrei umsetzen (Ende der Werbeeinschaltung).

Bekannte Problemkinder sind "Masse" ("im hohen Masse" versus
"Gesteinsmasse" ) und "Busse" ("Autobusse" vs. "tuet Busse"). In beiden
Faellen wird die jeweils erste Variante als richtig angenommen,
die anderen Alternativen erreicht man durch "Gesteinsmas{}se" bzw. durch
"tuet Bu{}sse". Ich nehme gerne noch weitere Ausnahmeregeln entgegen.

-Gustaf Neumann
-------------------------------------------------------------------
Gustaf Neumann       [email protected], [email protected]
Vienna University of Economics and Business Administration
Augasse 2-6,  A-1090 Vienna, Austria
Tel: +43 (222) 31-336 x4533     Fax 347-555

------------------------------------- cut here -----diac.shar-----------
# This is a shell archive.  Remove anything before this line,
# then unpack it by saving it in a file and typing "sh file".
#
# Wrapped by neumann on Sun Nov 18 23:20:06 1990
#
# This archive contains:
#       diac.l          Makefile        diacaux.c       diacaux.h
#

LANG=""; export LANG

echo x - diac.l
cat >diac.l <<'@EOF'
%{

/* diac.l
* lex file for converting Ascii German into diacritical German
* Version 1.0 written by
*  Dorai Sitaram, Rice University, 1990   [email protected]
*
* Version 1.1:
* General rewrite, using some Material from
*    H.Kaeslin, Behandlung der Umlaute bei der Verarbeitung deutscher
*    Texte unter Unix, in: it, Vol 1, 1988
* and Duden - die Rechtschreibung.
*
* Gustaf Neumann, Wirtschaftsuniversitaet Wien, October 1990
* [email protected]         [email protected]
*
* The resulting LaTeX file uses german.sty!
* Representation of umlaut characters:    \"a \"A \"o \"O \"u \"U {\ss}
* The style file german.sty would allow    "a  "A  "o  "O  "u  "U "s
* as well, but the latter representation makes it impossible
* to distinguish between umlaut characters and quoted text. This distinction
* is necessary in cases where quotes should be changed into opening and
* closing German quotes (\glqq and \grqq) in an automated way (another
* lex program).
*
* If you do NOT want to use GERMAN.STY, replace underneath the ruleset
* for \documentstyle with the following rule:

\\documentstyle[^\}]*\} { printf("%s\n", yytext);
                       printf("\\newskip\\zeeskip\n");
                       printf("\\zeeskip=0pt plus0pt minus0pt\n");
                       printf("\\def\\1{\\nobreak\\hskip\\zeeskip}\n");
                       printf("\\let\\umlaut\\\"\n");
                       printf("\\def\\\"#1{\\1\\umlaut#1\\1}\n");
                       printf("\\let\\oldss\\ss\n");
                       printf("\\def\\ss{\\1\\oldss\\1}\n"); }
*
*
* To prevent the conversion from Ascii German into diacritical German,
* it is necessary to insert empty groups into the words (e.g. Ka{}eslin).
*/

#include "diacaux.h"
int i;
%}

%p 6500
%n 1000
%e 2500
%a 4000
%k 2500
%o 3500


V       [AEIOUaeiou]
C       [B-DF-HJ-NP-TV-Zb-df-hj-np-tv-z]
W       [ "'\t\n,;!?().]
b       [ \t\n]

%%

\\documentstyle{b}*\{   printf("\\documentstyle[german]{"); /* no option list: add [german] */
\\documentstyle{b}*\[.*german.*\]{b}*\{   ECHO; /* german is already among the options */
\\documentstyle{b}*\[.*\]{b}*\{ {
                         /* Other options present: insert "german," in front
                          * of them.  yytext[0..13] holds "\documentstyle";
                          * the pattern guarantees a '[' after it, possibly
                          * preceded by white space, so search for the
                          * bracket instead of assuming a fixed offset. */
                         for(i=14;yytext[i]!='[';i++);
                         printf("\\documentstyle[german,%s",&yytext[i+1]);}



\\input{b}*\{[^\}]+\}   { texfile = getfilenamebrack(&yytext[6]); /* &yytext[6]: text following "\input" */
                       tempfile = maketempfilename(texfile);
                       printf("\\input{%s}", tempfile);          /* the document now inputs the converted copy */
                       dosubdiac(texfile, tempfile); }

\\input{b}*[^ \t\n]+    { texfile = getfilename(&yytext[6]);      /* unbraced form: \input file */
                       tempfile = maketempfilename(texfile);
                       printf("\\input %s", tempfile);
                       dosubdiac(texfile, tempfile); }

\\begin\{.+\}   ECHO; /* copied unchanged; .+ reaches to the last '}' on the line */
\\end\{.+\}     ECHO; /* same for \end{...} */
\\[A-Za-z]+     ECHO; /* control sequence names are never converted */


%{ /* ue
    * Exceptions for "ue".  The scanner takes the longest match and, for
    * matches of equal length, the rule listed first.  Every pattern below
    * is longer than the generic two-character rule [AaOoUu]e at the end of
    * the file, so it wins.  ECHO rules copy a word fragment in which "ue"
    * is not an umlaut; printf rules convert only part of the fragment.
    */
%}

[Rr]euessier    printf("%ce\\\"ussier", yytext[0]);
[^igGbB][Ee]ue  ECHO;
[QqAa]ue        ECHO;
[Uu]e[iu]       ECHO;
[Gg]etue{W}     ECHO;
[a-rt-z]tuend   ECHO;
{W}tuet{W}      ECHO;
[Nn]ichtstuend  ECHO;
[Nn]ichtstuer   ECHO;
Tuerei{W}       ECHO;
[a-z]tuerei     ECHO;
[a-z]tuerisch   ECHO;
[Aa]bzue[b-z][a-z]*[elr]n       ECHO;
[Aa]nzue[b-z][a-z]*[elr]n       ECHO;
[Aa]u[fs]zue[b-z][a-z]*[elr]n   ECHO;
[Ee]inzue[b-z][a-z]*[elr]n      ECHO;
[Hh]inzue[b-z][a-z]*[elr]n      ECHO;
[Mm]itzue[b-z][a-z]*[elr]n      ECHO;
[Nn]achzue[b-z][a-z]*[elr]n     ECHO;
[Vv]orzue[b-z][a-z]*[elr]n      ECHO;
[Ww]iederzue[b-z][a-z]*[elr]n   ECHO;
[Zz]ue[b-z][a-z]*[elr]n         ECHO;
[Zz]urueckzue[b-z][a-z]*[elr]n  printf("%cur\\\"uckzu%s",yytext[0],&yytext[9]);
tuendere        ECHO;
[Aa]biguen      ECHO;
[Aa]ffluen      ECHO;
[Bb]u[ea]nos    ECHO;
[Dd]uett        ECHO;
[Dd]uell        ECHO;
entuell         ECHO;
[Gg]raduell     ECHO;
[Gg]uerill      ECHO;
[Ii]ndividuen   ECHO;
[Ii]nfluen      ECHO;
Lueger          ECHO;
[krx]tuell      ECHO;
[Kk]ongruen     ECHO;
[Kk]onstituen   ECHO;
[Mm]enuett      ECHO;
[Mm]anuell      ECHO;
[Mm]igue[tl]    ECHO;
[Pp]irouett     ECHO;
[Pp]uerto       ECHO;
[Rr]esiduen     ECHO;
[Ss]tatue       ECHO;
[Ss]exuell      ECHO;
[Ss]uez         ECHO;
[Vv]enezuel     ECHO;
[Vv]isuell      ECHO;
[Zz]uerkannt    ECHO;
[Zz]uerteil     ECHO;
[Zz]uerst       ECHO;


%{ /* ae
    * Exceptions for "ae": fragments echoed here keep their "ae" because
    * they are longer than the generic [AaOoUu]e rule at the end of the
    * file.  The "aeuel" rule converts the "ae" but leaves the following
    * "ue" alone.  The last three rules ([Ss]amuel, [Vv]alue, [Tt]rue)
    * protect a "ue" rather than an "ae".
    */
%}

[Aa]ero         ECHO;
[Dd]odekae      ECHO;
[Hh]exae        ECHO;
[Ii]kosae       ECHO;
[Ii]srael       ECHO;
[Kk]afkaesk     ECHO;
aeuel           printf("\\\"auel");
[Mm]ichael      ECHO;
[Mm]etae        ECHO;
[Oo]ctae        ECHO;
[Pp]entae       ECHO;
[Pp]harmae      ECHO;
[Rr]affael      ECHO;
[Rr]afael       ECHO;
[Rr]aphael      ECHO;
[Tt]etrae       ECHO;
[Tt]hemae       ECHO;
[Ss]chemae      ECHO;
[Ss]amuel       ECHO;
[Vv]alue{W}     ECHO;
[Tt]rue{W}      ECHO;


%{ /* oe
    * Exceptions for "oe": fragments echoed here keep their "oe".
    * Several patterns (e.g. [Pp]hoto, [Rr]adio, [Vv]ideo) contain no "oe"
    * at all: they consume a prefix ending in "o", so an "e" that follows
    * the prefix can no longer be paired with that "o" by the generic
    * [AaOoUu]e rule at the end of the file.
    */
%}

[Aa]utoe        ECHO;
[Bb]enzoe       ECHO;
[Cc]hemoe       ECHO;
[Dd]iarrhoea    ECHO;
[Ee]lektroe     ECHO;
[Gg]oethe       ECHO;
[Hh]eroen       ECHO;
[Hh]o[ml]oe     ECHO;
[Hh]ydroe       ECHO;
[Ii]ndoeuro     ECHO;
Joel            ECHO;
[Kk]inoe        ECHO;
[Kk]oedukat     ECHO;
[Kk]oeffizi     ECHO;
[Kk]oerzi       ECHO;
[Kk]oexist      ECHO;
[Cc]oexist      ECHO;
[Kk]oenzym      ECHO;
[Kk]ontoe       ECHO;
[Ss]oeben       ECHO;
Soest           ECHO;
[Mm]etazoe      ECHO;
[Mm][ai][ck]roe ECHO;
[Mm]onoe        ECHO;
[Nn]euroe       ECHO;
[Oo]boe         ECHO;
[Oo]erlikon     ECHO;
[Oo]ldesloe     ECHO;
[Oo]kto         ECHO;
[Oo]pto         ECHO;
[Pp]oesie       ECHO;
[Pp]oebene      ECHO;
[Pp]iezo        ECHO;
[Pp]hoto        ECHO;
[Pp]hysioe      ECHO;
[Pp]oe[mt]i     ECHO;
[Pp]oe[mt][^a-z]        ECHO;
[Pp]orto        ECHO;
[Pp]roenzy      ECHO;
[Pp]roto        ECHO;
[Pp]rotozoe     ECHO;
[Pp]seudo       ECHO;
[Pp]sycho       ECHO;
[Pp]yro         ECHO;
[Rr]adio        ECHO;
[Tt]otoer       ECHO;
[Tt]urbo        ECHO;
[Vv]ideo        ECHO;


%{ /* ss
    * Rules for "ss".  ECHO rules keep a double s; printf rules rewrite the
    * fragment, usually replacing "ss" by {\ss}, and copy the characters
    * matched by character classes from yytext.  All of them are longer
    * than the generic "ss" rule at the very end and therefore win over it.
    * The last two rules of the file are the fallbacks that perform the
    * actual umlaut and sharp-s conversion for everything not listed.
    */
%}

{V}sss          printf("%c{\\ss}s",yytext[0]);
[EeAu][iu]ss    printf("%c%c{\\ss}", yytext[0],yytext[1]);
{C}{V}sser{W}   ECHO;
{C}{V}sser{V}   ECHO;
{C}{V}ssen      ECHO;
[^r]uesse[ln]   printf("%c\\\"usse%c",yytext[0],yytext[6]);
luesse          printf("l\\\"usse");
iess            printf("ie{\\ss}");
ssung           ECHO;
ssel            ECHO;
ssoren          ECHO;
ssiez           ECHO;
ccess           ECHO;
ssidy           ECHO;
chss            ECHO;
ssch            ECHO;
sspr            ECHO;
ssier           ECHO;
nisse           ECHO;
lss             ECHO;
ss'             ECHO;
tionss          ECHO;
tss             ECHO;
ussisch         ECHO;
ungss           ECHO;
usserl{W}       ECHO;
[Aa]ssoz        ECHO;
[Aa]ssist       ECHO;
[Aa]ssemb       ECHO;
[Aa]uss[^e]     ECHO;
[Aa]usse[^rn]   ECHO;
[Aa]ussende     ECHO;
[Ee]sse         ECHO;
[Bb]isschen     printf("%ci{\\ss}chen", yytext[0]);
[Bb]usiness     ECHO;
[Bb]usse        ECHO;
[Bb]ussard      ECHO;
triebss         ECHO;
beitss          ECHO;
[Dd]iskussion   ECHO;
[Dd]issert      ECHO;
[Dd]asselb      ECHO;
[Ee]ssi         ECHO;
[Ff]lusse       ECHO;
[Ff]luess[ie]   printf("%cl\\\"uss%c", yytext[0],yytext[6]);
Grass           ECHO;
[Gg]enosse      ECHO;
[Gg]rosse       printf("%cro{\\ss}e",yytext[0]);
[Ii]nteress     ECHO;
[Kk]lass[ie]    ECHO;
[Kk]assette     ECHO;
[Ll]asse        ECHO;
[Ll]aessig      printf("%c\\\"assig", yytext[0]);
[Mm]assa[^nr]   ECHO;
[Mm]asseu       ECHO;
[Mm]isser{C}    printf("%ci{\\ss}er%c", yytext[0],yytext[6]);
[Mm]iss[ei]     ECHO;
[Ee]rmassen     printf("%crma{\\ss}en", yytext[0]);
[Mm]assi        ECHO;
[Pp]rivatissi   ECHO;
[Pp]assiv       ECHO;
[Pp]rozessor    ECHO;
[Ss]tossen      printf("%cto{\\ss}en", yytext[0]);
[Rr]essource    ECHO;
[Ww][ia]sse     ECHO;

{C}ss{C}        ECHO; /* "ss" between two consonants is kept */

[AaOoUu]e       printf("\\\"%c", yytext[0]); /* generic rule: Ae/Oe/Ue -> \"A/\"O/\"U */
ss              printf("{\\ss}"); /* generic rule: ss -> {\ss} */

@EOF

chmod 644 diac.l

echo x - Makefile
cat >Makefile <<'@EOF'
#
# Builds diac from the lex scanner diac.l and the helpers in diacaux.c.
#
# if you do not have flex available, deactivate the definitions of
# LEX and LEXLIB; The program compiled with flex works also with the
# standard lex library (-ll).
#
# Recipe lines must start with a TAB character.
#
LEX=flex
LEXLIB=-lfl
PROGS= diac

all: ${PROGS}

# -f: not every compiler driver leaves the .o files behind when it
# compiles and links in one step, and a failing rm would fail the build.
diac: diac.l diacaux.h diacaux.c
	${LEX} ${LFLAGS} diac.l
	cc -O ${DEFINES} -o $@ diacaux.c lex.yy.c ${LEXLIB}
	strip $@
	rm -f lex.yy.c lex.yy.o diacaux.o

# The "./" prefix keeps the '#' of the editor autosave pattern from
# starting a shell comment (which would also have hidden "core").
clean:
	rm -f ${PROGS} *.o *~ ./\#* core


shar:
	shar diac.l Makefile diacaux.c diacaux.h > diac.shar
@EOF

chmod 644 Makefile

echo x - diacaux.c
cat >diacaux.c <<'@EOF'
/* diacaux.c
* to be linked with lex.yy.c from diac.l
* written by Dorai Sitaram, Rice University, 1990
*/

#include "diacaux.h"

#include <stdlib.h>
#include <string.h>

/* slen
* Return the number of characters in the NUL-terminated string s, not
* counting the terminator.
*
* The scan starts at index 0 so that an empty string yields 0 instead of
* reading the byte behind its terminator.
*/
int slen(s)
char *s;
{
 int i;

 for (i = 0; s[i] != '\0'; i++)
       ;

 return i;
}

/* strap
* Return a newly allocated string holding s followed by t.
*
* The caller owns the result and releases it with free().
* Returns NULL if either argument is NULL or if memory is exhausted.
*/
char *strap(s,t)
char *s,*t;
{
 size_t slength, tlength;
 char *r;

 if (s == NULL || t == NULL)
       return NULL;

 slength = strlen(s);
 tlength = strlen(t);

 /* both strings plus one byte for the terminating NUL */
 r = malloc(slength + tlength + 1);
 if (r == NULL)
       return NULL;

 memcpy(r, s, slength);
 memcpy(r + slength, t, tlength + 1);   /* copies t's terminator as well */

 return r;
}

/* getfilename
* Return a newly allocated copy of s with leading blanks, tabs and
* newlines removed.  Used for the unbraced form "\input file".
*
* The caller owns the result and releases it with free().
* Returns NULL if s is NULL or if memory is exhausted.
*/
char *getfilename(s)
char *s;
{
 size_t size;
 char *r;

 if (s == NULL)
       return NULL;

 while (*s == ' ' || *s == '\t' || *s == '\n')
       s++;

 /* remaining characters plus the terminating NUL */
 size = strlen(s) + 1;
 r = malloc(size);
 if (r != NULL)
       memcpy(r, s, size);

 return r;
}

/* getfilenamebrack
* Extract the file name from the braced form "\input{file}".
*
* Leading blanks, tabs, newlines and opening braces are skipped; the
* name then runs up to, but not including, the next '}'.  If there is
* no closing brace the name runs to the end of the string, so the scan
* never leaves the buffer.
*
* The caller owns the result and releases it with free().
* Returns NULL if s is NULL or if memory is exhausted.
*/
char *getfilenamebrack(s)
char *s;
{
 size_t length;
 char *r;

 if (s == NULL)
       return NULL;

 s += strspn(s, " \t\n{");
 length = strcspn(s, "}");      /* stops at '}' or at the terminator */

 r = malloc(length + 1);
 if (r == NULL)
       return NULL;

 memcpy(r, s, length);
 r[length] = '\0';

 return r;
}

/* maketempfilename
* Build the name of the temporary copy of file s: "/tmp/" followed by s
* with every '/' replaced by '_'.
*
* The result is assembled in a single buffer, so no intermediate copy
* is left behind.  The caller owns the result and releases it with free().
* Returns NULL if s is NULL or if memory is exhausted.
*
* NOTE(review): the resulting name is predictable and lives in a shared
* directory; confirm that this is acceptable where diac is run.
*/
char *maketempfilename(s)
char *s;
{
 static char prefix[] = "/tmp/";
 size_t plength = sizeof prefix - 1;
 char *r, *p;

 if (s == NULL)
       return NULL;

 /* prefix, the name itself, and the terminating NUL */
 r = malloc(plength + strlen(s) + 1);
 if (r == NULL)
       return NULL;

 memcpy(r, prefix, plength);

 for (p = r + plength; *s != '\0'; s++)
       *p++ = (*s == '/') ? '_' : *s;

 *p = '\0';

 return r;
}

/* shquote
* Append src to dst as one single-quoted shell word and return the new
* end of dst.  An embedded single quote is written as '\'' (close the
* quote, escaped quote, reopen).  dst must have room for 4 bytes per
* source character plus the two enclosing quotes.
*/
static char *shquote(dst, src)
char *dst, *src;
{
 *dst++ = '\'';

 for (; *src != '\0'; src++) {
       if (*src == '\'') {
               memcpy(dst, "'\\''", 4);
               dst += 4;
       } else
               *dst++ = *src;
 }

 *dst++ = '\'';

 return dst;
}

/* dosubdiac
* Run "diac" through the shell with its input taken from file s and its
* output written to file t.
*
* The file names come from the document being converted, so they are
* passed to the shell as single-quoted words; blanks or shell
* metacharacters in a name cannot alter the command.  The command
* buffer is released after use.  Nothing is done if either name is NULL
* or if memory is exhausted.  The exit status of the command is not
* examined.
*/
void dosubdiac(s,t)
char *s,*t;
{
 static char head[] = "diac <";
 static char mid[] = " > ";
 char *cmd, *p;

 if (s == NULL || t == NULL)
       return;

 /* worst case: every character of a name expands to 4 bytes, plus
  * 2 quotes per name; the two sizeof terms cover the fixed text and
  * the terminating NUL */
 cmd = malloc(sizeof head + sizeof mid + 4 * (strlen(s) + strlen(t)) + 4);
 if (cmd == NULL)
       return;

 p = cmd;
 memcpy(p, head, sizeof head - 1);
 p += sizeof head - 1;
 p = shquote(p, s);
 memcpy(p, mid, sizeof mid - 1);
 p += sizeof mid - 1;
 p = shquote(p, t);
 *p = '\0';

 system(cmd);
 free(cmd);
}
@EOF

chmod 644 diacaux.c

echo x - diacaux.h
cat >diacaux.h <<'@EOF'
/* diac.h
* to be included in diac.l and diac.c
* written by Dorai Sitaram, Rice University, 1990
*/

char *texfile;
char *tempfile;
int slen();
char *strap();
char *getfilename();
char *getfilenamebrack();
char *maketempfilename();
void dosubdiac();
@EOF

chmod 644 diacaux.h

exit 0

ERICH NEUWIRTH
BITNET (EARN): A4422DAB@AWIUNI11
INTERNET:      [email protected]
Institute for Statistics and Computer Science
UNIVERSITY OF VIENNA, UNIVERSITAETSSTR. 5/9, A-1010 VIENNA, AUSTRIA