%% This OTP is part of the Makor package (version 2) for typesetting
%% Hebrew with Omega (Dec, 2002).  Its purpose is to mark word
%% beginnings and endings with special flags---sequences of
%% characters---that mark these boundaries.  Subsequent filters can
%% then strip off these markers, and do whatever else at these
%% boundaries.  Most notably, we'll substitute certain glyphs by word
%% final or word initial forms.
%%
%% It's not the purpose of this OTP to perform strict lexical analysis
%% as to what constitutes a valid input of a Hebrew word; other
%% filters do that at subsequent stages.  This current filter shall
%% immediately precede `m2context', so that any contextual analysis
%% done before this needs to be done separately by a foregoing OTP.
%% This will not be a problem, except for Yiddish (and possibly other
%% Hebrew-whatever dialects).
%%
%% For our purposes, this OTP examines input before the Makor
%% transcription rules have been changed.  Therefore, at this point,
%% Hebrew words can consist of sequences of Ascii chars drawn from
%% this set:
%%      lowercase consonants;
%%      lowercase vowels;
%%      uppercase `O';
%%      special chars + | * ^ ` ' _
%%      special special chars : .
%% Some comments on the special specials!  The colon when followed by
%% certain vowels (+, e, a) is a hataf-vowel; otherwise, it is a colon.  The
%% period is a period unless it is immediately followed by `s' or `t'.

%% Character widths of the two streams.  NOTE(review): per the Omega
%% OTP conventions these numbers are presumably bytes per character
%% (one byte in, one byte out) -- confirm against the Omega manual.
input:  1;
output: 1;

%% Scanner states used by the word-boundary rules:
%%      IN   -- we are inside a word;
%%      OUT  -- we are between words.
states: IN, OUT;
aliases:
       %% Character classes used by the expressions section.  Numeric
       %% values are Ascii codes.
       %%
       %% OKALPH: the letters permitted inside a tuning or adjustment
       %% argument.
       OKALPH  = (`b'|`c'|`d'|`i'|`m'|`n'|`p'|`s'|`t')
               ; % only these needed to specify units (bp, cm, pt, etc)
       %% ARG: one character of a tuning or adjustment argument.
       ARG     = (43|45|46|48-57|{OKALPH}
                       ) % 43=plus, 45=minus, 46=period, 48-57=digits, and the OK letters
               ;
       %% Delimiters and separators of tunings and adjustments.
       U       = 95 % underscore
               ;
       C       = 94 % circumflex
               ;
       COMMA   = 44 % comma
               ;
       SEMI    = 59 % semicolon
               ;
       %% LETTER: any character that may be part of a word.
       % 39=right apostrophe; 96=left apostrophe
       LETTER  = (`a'-`z'|`O'|`+'|`|'|`*'|`^'|`_'|`"'|
                       `['|`]'|39|96| % reg Hebrew + trope
                       128-135|197) % Yiddish, Ladino glyphs
               ;
       %% SPECIAL: the colon and the period, which are word characters
       %% only in certain contexts.
       SPECIAL = (`:'|46) % 46=period
               ;
       %% AUGMENTED: a LETTER or one of the SPECIAL characters.
       AUGMENTED       = ({LETTER}|{SPECIAL})
                       ;
       %% Short names for the classes above.  L and A match one LETTER
       %% or one AUGMENTED character; NL and NA are their complements
       %% (the ^ prefix negates a class in OTP syntax).
       L       = ({LETTER})
               ;
       NL      = ^({LETTER})
               ;
       A       = ({AUGMENTED})
               ;
       NA      = ^({AUGMENTED})
               ;
%% (As of late December, 2002, version 1.15 of Omega will not work
%% when both categories of letters---that is, LETTER and AUGMENTED---
%% are used.  We therefore use the less restrictive version, and hope
%% that we can use explicit pattern matching in m2context.otp to fix
%% the lacks.  We also hope that Omega will soon be fixed!)

expressions:
%% Before we do some word parsing, note that _ and ^ are also used to
%% delimit fine tuning and various adjustments.  Thus, we need to
%% match this stuff first...
%%
%% NOTE(review): the rules in the first two groups carry no <state>
%% prefix.  Whether Omega tries them in every state or only in the
%% initial state is not visible from this file -- confirm against the
%% OTP documentation.

%% First let's do the accent fine-tuning.  Fine Tuning consists of a
%% pair of dimens, which are used as coordinates to adjust the
%% placement of an accent or cantillation mark.  Tunings can consist
%% of a PAIR of coordinates.  In case a letter has both an accent and
%% a trope, the first pair applies to the vowel+trope unit, while the
%% second applies to the displacement of the second diacrit from the
%% first.
%%
%% Each dimen is 1 to 9 {ARG} characters.  The two-pair forms
%% (pair;pair) are listed before the one-pair forms.  In every rule
%% the right-hand side wraps \(* + 1 - 1) -- the matched text without
%% its first and last character, i.e. without the delimiters -- in a
%% TeX macro call, and then pushes state OUT.
%%
       % _x,y;x,y_  (underscore-delimited, two pairs) -> \lowertuning
       {U} {ARG}<1,9>{COMMA}{ARG}<1,9>{SEMI}{ARG}<1,9>{COMMA}{ARG}<1,9> {U}
               => "{\clearocplists\lowertuning[" \(* + 1 - 1) "]}"
             <= <push: OUT>
               ;
       % ^x,y;x,y^  (circumflex-delimited, two pairs) -> \uppertuning
       {C} {ARG}<1,9>{COMMA}{ARG}<1,9>{SEMI}{ARG}<1,9>{COMMA}{ARG}<1,9> {C}
               => "{\clearocplists\uppertuning[" \(* + 1 - 1) "]}"
             <= <push: OUT>
               ;
       % _x,y_  (underscore-delimited, one pair) -> \lowertune
       {U} {ARG}<1,9>{COMMA}{ARG}<1,9> {U}
               => "{\clearocplists\lowertune[" \(* + 1 - 1) "]}"
             <= <push: OUT>
               ;
       % ^x,y^  (circumflex-delimited, one pair) -> \uppertune
       {C} {ARG}<1,9>{COMMA}{ARG}<1,9> {C}
               => "{\clearocplists\uppertune[" \(* + 1 - 1) "]}"
             <= <push: OUT>
               ;
%% An adjustment is a single number, with or without a decimal, which
%% represents the fraction of the width of the glyph by which the
%% accent is displaced.  It is much less powerful than a tuning, but
%% is sometimes useful.
%%
       % _n_  (underscore-delimited) -> \loweradj
       {U} {ARG}<1,9> {U}      =>
               "{\clearocplists\loweradj{" \(* + 1 - 1) "}}"
             <= <push: OUT>
               ;
       % ^n^  (circumflex-delimited) -> \upperadj
       {C} {ARG}<1,9> {C}      =>
               "{\clearocplists\upperadj{" \(* + 1 - 1) "}}"
             <= <push: OUT>
               ;

%% A `word' is a sequence of zero or more `augmented letters' ended by
%% at least one regular letter (and terminated by a space, punct, etc).
%%
%% We use the sequence `)nullchar' and `nullchar(' as the end/start of
%% word markers.  (Nullchar=char254)
%%
%% NOTE(review): the rules below test only {A} and {NA}; the aliases
%% {L} and {NL} are not referenced anywhere in this section, so the
%% `ended by at least one regular letter' condition is not enforced
%% here.
%%
%% NOTE(review): no rule below matches {A} in state IN except at end
%% of input.  Presumably Omega copies an unmatched character through
%% unchanged, which would keep us inside the word -- confirm.
%%
       % Inside a word, last character of the input: emit the character
       % followed by the end-of-word marker.
       <IN> {A} end:   => \* `)' 254 % eow
                       ;
       % Between words, a word character: emit the start-of-word
       % marker, push the character back to be read again, and push
       % state IN.
       <OUT>{A}        => 254 `(' % sow
                       <= \* <push: IN>
                       ;
       % Inside a word, a non-word character: emit the end-of-word
       % marker, push the character back, and push state OUT.
       <IN>{NA}        => `)' 254 % eow
                       <= \* <push: OUT>
                       ;
       % No <state> prefix, a word character: same actions as the
       % <OUT>{A} rule above.
       {A}             => 254 `(' % sow
                       <= \* <push: IN>
                       ;
       % No <state> prefix, a non-word character: emit nothing, push
       % the character back, and push state OUT.
       {NA}            =>
                       <= \* <push: OUT>
                       ;