%% This OTP is part of the Makor package (version 2) for typesetting
%% Hebrew with Omega (Dec, 2002). Its purpose is to mark word
%% beginnings and endings with special flags---sequences of
%% characters---that mark these boundaries. Subsequent filters can
%% then strip off these markers, and do whatever else at these
%% boundaries. Most notably, we'll substitute certain glyphs by word
%% final or word initial forms.
%%
%% It's not the purpose of this OTP to perform strict lexical analysis
%% as to what constitutes a valid input of a Hebrew word; other
%% filters do that at subsequent stages. This current filter shall
%% immediately precede `m2context', so that any contextual analysis
%% done before this needs to be done separately by a foregoing OTP.
%% This will not be a problem, except for Yiddish (and possibly other
%% Hebrew-whatever dialects).
%%
%% For our purposes, this OTP examines input before the Makor
%% transcription rules have been changed. Therefore, at this point,
%% Hebrew words can consist of sequences of Ascii chars drawn from
%% this set:
%% lowercase consonants;
%% lowercase vowels;
%% uppercase `O';
%% special chars + | * ^ ` ' _
%% special special chars : .
%% Some comments on the special specials! The colon when followed by
%% certain vowels (+, e, a) is a hataf-vowel; otherwise, it is a colon. The
%% period is a period unless it is immediately followed by `s' or `t'.

%% Character size, in bytes, of the input and output streams.
input: 1;
output: 1;

%% Named states used by the word-boundary rules.
states:
  IN,   % currently inside a word
  OUT;  % currently between words
aliases:
%% Named character classes referenced by the expressions section.
%%
%% Letters that may occur inside a tuning/adjustment argument.
OKALPH = (`b'|`c'|`d'|`i'|`m'|`n'|`p'|`s'|`t')
; % only these needed to specify units (bp, cm, pt, etc)
%% One character of a tuning/adjustment argument.
ARG = (43|45|46|48-57|{OKALPH}
) % 43=plus, 45=minus, 46=period, 48-57=digits, plus the OK letters
;
U = 95 % underscore; delimits the lower tunings and adjustments
;
C = 94 % circumflex; delimits the upper tunings and adjustments
;
COMMA = 44 % separates the two coordinates of a pair
;
SEMI = 59 % semicolon; separates two coordinate pairs
;
% 39=right apostrophe; 96=left apostrophe
% Note that LETTER contains the circumflex (94) and the underscore (95),
% i.e. the same characters as C and U above.
LETTER = (`a'-`z'|`O'|`+'|`|'|`*'|`^'|`_'|`"'|
`['|`]'|39|96| % reg Hebrew + trope
128-135|197) % Yiddish, Ladino glyphs
;
% The `special specials': colon and period, which are only sometimes
% part of a word.
SPECIAL = (`:'|46) % 46=period
;
% Any character that may belong to a word: a LETTER or a SPECIAL.
AUGMENTED = ({LETTER}|{SPECIAL})
;
% L and NL (LETTER and its complement) are not referenced by any of the
% expressions visible in this file; only A and NA are.
L = ({LETTER})
;
NL = ^({LETTER})
;
% A = an augmented letter; NA = anything that is not one.
A = ({AUGMENTED})
;
NA = ^({AUGMENTED})
;
%% (As of late December, 2002, version 1.15 of Omega will not work
%% when both categories of letters---that is, LETTER and AUGMENTED---
%% are used. We therefore use the less restrictive version, AUGMENTED,
%% and hope that we can use explicit pattern matching in m2context.otp
%% to fix the gaps. We also hope that Omega will soon be fixed!)

expressions:
%% Before we do any word parsing, note that _ and ^ are also used to
%% delimit fine tuning and various adjustments. Both characters are
%% members of LETTER as well, so we need to match this stuff first...

%% First let's do the accent fine-tuning. A fine tuning consists of a
%% pair of dimens, which are used as coordinates to adjust the
%% placement of an accent or cantillation mark. A tuning may also
%% consist of a PAIR of coordinate pairs, separated by a semicolon. In
%% case a letter has both an accent and a trope, the first pair applies
%% to the vowel+trope unit, while the second applies to the
%% displacement of the second diacritic from the first.
%%
%% Each coordinate is 1 to 9 ARG characters. In every rule below,
%% \(* + 1 - 1) is the matched text without its first and last
%% character, i.e. without the enclosing _ or ^ delimiters; it is
%% copied into the argument of the emitted macro. Nothing is pushed
%% back to the input, and state OUT is pushed.
%%
%% Two pairs, underscore-delimited: _x,y;x,y_  ->  \lowertuning[x,y;x,y]
{U} {ARG}<1,9>{COMMA}{ARG}<1,9>{SEMI}{ARG}<1,9>{COMMA}{ARG}<1,9> {U}
=> "{\clearocplists\lowertuning[" \(* + 1 - 1) "]}"
<= <push: OUT>
;
%% Two pairs, circumflex-delimited: ^x,y;x,y^  ->  \uppertuning[x,y;x,y]
{C} {ARG}<1,9>{COMMA}{ARG}<1,9>{SEMI}{ARG}<1,9>{COMMA}{ARG}<1,9> {C}
=> "{\clearocplists\uppertuning[" \(* + 1 - 1) "]}"
<= <push: OUT>
;
%% One pair, underscore-delimited: _x,y_  ->  \lowertune[x,y]
{U} {ARG}<1,9>{COMMA}{ARG}<1,9> {U}
=> "{\clearocplists\lowertune[" \(* + 1 - 1) "]}"
<= <push: OUT>
;
%% One pair, circumflex-delimited: ^x,y^  ->  \uppertune[x,y]
{C} {ARG}<1,9>{COMMA}{ARG}<1,9> {C}
=> "{\clearocplists\uppertune[" \(* + 1 - 1) "]}"
<= <push: OUT>
;
%% An adjustment is a single number, with or without a decimal, which
%% represents the fraction of the width of the glyph by which the
%% accent is displaced. It is much less powerful than a tuning, but
%% is sometimes useful.
%%
%% The pattern is 1 to 9 ARG characters between two underscores or two
%% circumflexes; the delimiters are stripped and the rest becomes the
%% braced argument of the emitted macro. State OUT is pushed afterwards.
%% NOTE(review): ARG also admits the unit letters (b c d i m n p s t),
%% so a short run of only those letters between two underscores or two
%% circumflexes fits this pattern too---confirm that this cannot
%% collide with real input.
%%
%% _n_  ->  \loweradj{n}
{U} {ARG}<1,9> {U} =>
"{\clearocplists\loweradj{" \(* + 1 - 1) "}}"
<= <push: OUT>
;
%% ^n^  ->  \upperadj{n}
{C} {ARG}<1,9> {C} =>
"{\clearocplists\upperadj{" \(* + 1 - 1) "}}"
<= <push: OUT>
;

%% A `word' is a sequence of zero or more `augmented letters' ended by
%% at least one regular letter (and terminated by a space, punct, etc).
%% (The rules below test only for augmented letters, A and NA; they do
%% not check that a word ends in a regular letter.)
%%
%% We use the sequence `)nullchar' and `nullchar(' as the end/start of
%% word markers. (Nullchar=char254)
%%
%% NOTE(review): the rules in this section only ever push states; no
%% pop appears here. Confirm that the state stack cannot overflow on
%% long input.
%%
%% Inside a word, last character of the input: emit that character
%% followed by the end-of-word marker.
<IN> {A} end: => \* `)' 254 % eow
;
%% Between words, an augmented letter: emit the start-of-word marker,
%% push the letter back onto the input, and push state IN.
<OUT>{A} => 254 `(' % sow
<= \* <push: IN>
;
%% Inside a word, a non-augmented character: emit the end-of-word
%% marker, push the character back onto the input, and push state OUT.
<IN>{NA} => `)' 254 % eow
<= \* <push: OUT>
;
%% The two rules below carry no state prefix.
%% NOTE(review): presumably they serve the initial state, before IN or
%% OUT has been pushed---confirm against the OTP state semantics.
%%
%% An augmented letter: same action as the <OUT> rule above.
{A} => 254 `(' % sow
<= \* <push: IN>
;
%% A non-augmented character: emit nothing, push the character back
%% onto the input, and push state OUT.
{NA} =>
<= \* <push: OUT>
;