GopherProxy

# koi8-plaintex.rus - KOI8 to PlainTeX conversion for translit
#
# version 1.0
#
# Created by Alexander L. Belikoff, 1997

# The TeX transliteration sequences follow the AMS Cyrillic convention for
# WNCYR fonts with the cyracc.def file.
# To be used with the translit.c program by Jan Labanowski. For the format of
# this file, consult the translit documentation.
# Process the KOI8-encoded file with translit. For example:
# translit -i myfile.ko8 -o myfile.tex -t koi8-plaintex.rus
# Then process it with Plain TeX by using:
# tex myfile.tex
# and then use your favorite dvi2something program. E.g., for PostScript use:
# dvips myfile.dvi
#

1 file version number

" " # string delimiters
[ ] # list delimiters
{ } # regular expression delimiters

# starting sequence for TeX
"\input cyracc.def
\font\tencyr=wncyr10
\def\cyr{\tencyr\cyracc}
\obeylines
\obeyspaces
"

#ending sequence
"
\bye
"

0 # number of input SHIFT sequences, only one set of input characters

2 # number of output SHIFT sequences, two sets of output characters

# SHIFT-OUT SHIFT-IN
"" "" #shift sequences for set 1 (Latin)
"{\cyr " "}" #cyrillic enclosed in {\cyr ... }

# conversion table
# inp_set inp_seq out_set out_seq

# characters which are not in ASCII (and DEL) and not in KOI8 to *
0 [\0x7F-\0xA2\0xA4-\0xB2\0xB4-\0xBF] 0 "$\star$"

# Dehyphenate words, e.g. con- (NL)cert is changed to concert(NL).
# Below is a complicated (?) regular expression. It joins a hyphenated
# word. It looks for one or more letters (saves them as substring 1)
# followed by a hyphen (which may be followed by zero or more spaces
# or tabs). The hyphen must be followed by a NewLine (characters 0A-0D hex
# are various new line sequences), which is saved as substring 2. Then it looks
# for zero or more tabs and spaces (at the beginning of the line). Then it
# looks for the rest of the hyphenated word and saves it as substring 3.
# The word may have punctuation attached. Then it looks again for some spaces
# or tabs. The substitute string junks all sequences which were not within (),
# i.e., the hyphen and spaces/tabs, and inserts only the substrings, but in a
# different order. Substring 1 (word beginning) is followed by 3 (word end) and
# then by the NewLine. The {\2\1\3} would be equally good. The string is then
# returned back for processing (output code is -1). Note that since the input
# regular expression is very long, I chopped it into several lines by using \NL.
# If \ is followed by white space, the \ and all white space which follows it
# are removed by the program. Be careful not to use "\white_space" in strings,
# lists or regular expressions. If you must, enter \ as a code (i.e., \0x5C).

# uncomment lines below if you want to dehyphenate

# 0 {([A-Za-z\0xA3\0xB3\0xC0-\0xFF]+)-[ \0x09]*([\0x0A-\0x0D]+)[ \0x09]*(\
# [A-Za-z\0xA3\0xB3\0xC0-\0xFF,.?;:")'`!]+)[ \0x09]}
# -1 {\1\3\2}

# All latin letters are converted to the same letters but with the output
# set 1
0 [A-Za-z] 1 [A-Za-z] #Latin letters A-Z and a-z

# Add \medskip before the blank lines
0 {([\0x0B-\0x0D]*)\0x0A[ \0x09]*\0x0A([\0x0B-\0x0D]*)} 0 {\\medskip\1\0x0A\2}

# Quote some special TeX characters

# these do not require going out of {\cyr ....}
0 "[" 0 "$[$"
0 "]" 0 "$]$"
0 "^" 0 "$\wedge$"
0 "{" 0 "$\lbrace$"
0 "}" 0 "$\rbrace$"
0 "~" 0 "$\sim$"
0 "\" 0 "$\backslash$"
0 "|" 0 "$\mid$"
0 "*" 0 "$\star$"
0 "<" 0 "$<$"
0 ">" 0 "$>$"
0 "$" 0 "\$"
0 "%" 0 "\%"

# these can be represented correctly only in Latin charset
0 "_" 1 "\_"
0 "&" 1 "\&"
0 "#" 1 "\#"
0 "@" 1 "@"

# Cyrillic letters
#
# Each row maps one KOI8 byte (or a pair of bytes) of the input to a
# transliteration sequence written in output set 2, i.e. enclosed in
# {\cyr ... } by the shift sequences declared above.
#
# The two-byte rows below cover T/t immediately followed by S/s, Sh/sh or
# Shch/shch. A {\cydot} is placed between the two letters so that the output
# never contains a bare "TS"/"Ts"/"tS"/"ts" pair; the original author marks
# this as being done "to prevent C" (the letter mapped from 0xE3/0xC3 below).
# NOTE(review): presumably the WNCYR fonts render "ts" as that same letter -
# confirm against the font/cyracc.def documentation.
# NOTE(review): other pairs whose sequences concatenate into another letter's
# sequence (e.g. "sh" + "ch" gives "shch") are not separated by this table -
# confirm whether that is intended.
0 "\0xF4\0xFD" 2 "T{\cydot}Shch" # T + Shch: keep T and S apart (to prevent C)
0 "\0xF4\0xDD" 2 "T{\cydot}shch" # T + shch: keep T and s apart (to prevent C)
0 "\0xD4\0xFD" 2 "t{\cydot}Shch" # t + Shch: keep t and S apart (to prevent C)
0 "\0xD4\0xDD" 2 "t{\cydot}shch" # t + shch: keep t and s apart (to prevent C)

0 "\0xF4\0xFB" 2 "T{\cydot}Sh" # T + Sh: keep T and S apart (to prevent C)
0 "\0xF4\0xDB" 2 "T{\cydot}sh" # T + sh: keep T and s apart (to prevent C)
0 "\0xD4\0xFB" 2 "t{\cydot}Sh" # t + Sh: keep t and S apart (to prevent C)
0 "\0xD4\0xDB" 2 "t{\cydot}sh" # t + sh: keep t and s apart (to prevent C)

0 "\0xF4\0xF3" 2 "T{\cydot}S" # T + S: keep T and S apart (to prevent C)
0 "\0xF4\0xD3" 2 "T{\cydot}s" # T + s: keep T and s apart (to prevent C)
0 "\0xD4\0xF3" 2 "t{\cydot}S" # t + S: keep t and S apart (to prevent c)
0 "\0xD4\0xD3" 2 "t{\cydot}s" # t + s: keep t and s apart (to prevent c)

# Yo: the double quote is written as the octal code \0o42 because " is the
# string delimiter of this file.
0 "\0xA3" 2 "\\0o42e" # small \"e (yo)
0 "\0xB3" 2 "\\0o42E" # capital \"E (Yo)
# Capital letters
0 "\0xE1" 2 "A"
0 "\0xE2" 2 "B"
0 "\0xF7" 2 "V"
0 "\0xE7" 2 "G"
0 "\0xE4" 2 "D"
0 "\0xE5" 2 "E"
0 "\0xF6" 2 "Zh"
0 "\0xFA" 2 "Z"
0 "\0xE9" 2 "I"
0 "\0xEA" 2 "{\u I}" # I kratkoje
0 "\0xEB" 2 "K"
0 "\0xEC" 2 "L"
0 "\0xED" 2 "M"
0 "\0xEE" 2 "N"
0 "\0xEF" 2 "O"
0 "\0xF0" 2 "P"
0 "\0xF2" 2 "R"
0 "\0xF3" 2 "S"
0 "\0xF4" 2 "T"
0 "\0xF5" 2 "U"
0 "\0xE6" 2 "F"
0 "\0xE8" 2 "Kh"
0 "\0xE3" 2 "C"
0 "\0xFE" 2 "Ch"
0 "\0xFB" 2 "Sh"
0 "\0xFD" 2 "Shch"
0 "\0xFF" 2 "{\Cdprime}" # Tverdyj znak
0 "\0xF9" 2 "Y"
0 "\0xF8" 2 "{\Cprime}" # Myagkij znak
0 "\0xFC" 2 "\`E"
0 "\0xE0" 2 "Yu"
0 "\0xF1" 2 "Ya"
# Small letters (same order as the capital letters above)
0 "\0xC1" 2 "a"
0 "\0xC2" 2 "b"
0 "\0xD7" 2 "v"
0 "\0xC7" 2 "g"
0 "\0xC4" 2 "d"
0 "\0xC5" 2 "e"
0 "\0xD6" 2 "zh"
0 "\0xDA" 2 "z"
0 "\0xC9" 2 "i"
0 "\0xCA" 2 "{\u i}" # i kratkoje
0 "\0xCB" 2 "k"
0 "\0xCC" 2 "l"
0 "\0xCD" 2 "m"
0 "\0xCE" 2 "n"
0 "\0xCF" 2 "o"
0 "\0xD0" 2 "p"
0 "\0xD2" 2 "r"
0 "\0xD3" 2 "s"
0 "\0xD4" 2 "t"
0 "\0xD5" 2 "u"
0 "\0xC6" 2 "f"
0 "\0xC8" 2 "kh"
0 "\0xC3" 2 "c"
0 "\0xDE" 2 "ch"
0 "\0xDB" 2 "sh"
0 "\0xDD" 2 "shch"
0 "\0xDF" 2 "{\cdprime}" # tverdyj znak
0 "\0xD9" 2 "y"
0 "\0xD8" 2 "{\cprime}" # myagkij znak
0 "\0xDC" 2 "\`e"
0 "\0xC0" 2 "yu"
0 "\0xD1" 2 "ya"