Path: usenet.cise.ufl.edu!huron.eel.ufl.edu!usenet.eel.ufl.edu!nntpX.primenet.com!nntp.primenet.com!news.idt.net!WCG!cyclone.i1.net!uunet!in1.uu.net!news.neta.com!not-for-mail
From: [email protected] (Jari Aalto+mail.perl)
Newsgroups: comp.lang.perl.announce,comp.lang.perl.modules
Subject: v1998.1008 Squeeze.pm -- Shorten text to minimum syllables
Followup-To: comp.lang.perl.modules
Date: 26 Nov 1998 03:33:32 GMT
Organization: University of Tampere
Lines: 210
Approved: [email protected] (comp.lang.perl.announce)
Message-ID: <[email protected]>
NNTP-Posting-Host: gadget.cscaper.com
X-Trace: news.neta.com 912051212 14630 206.67.186.3 (26 Nov 1998 03:33:32 GMT)
X-Complaints-To: [email protected]
NNTP-Posting-Date: 26 Nov 1998 03:33:32 GMT
X-Disclaimer: The "Approved" header verifies header information for article transmission and does not imply approval of content.
Xref: usenet.cise.ufl.edu comp.lang.perl.announce:171 comp.lang.perl.modules:6285



Download

       ANNOUNCE: v1998.1008 Squeeze.pm -- Shorten text to minimum syllables
       The version number is based on ISO date format YYYY.MMDD.

       Home page:

           (eg. ftp://ftp.funet.fi/pub/languages/perl/CPAN/)
           CPAN//modules/by-module/Lingua/

       Perl language interpreter pointers at (Win32/Unix etc.)
       Perl: http://language.perl.com/info/software.html


       A module that I use to compress text from email before it is
       sent to my Cellular phone. If you have a pager, you know how
       tight the space is and every extra character saved is a plus.

       A shortened POD page follows. The Module's Interface functions
       and interface variables are not included in this announcement.

       I would welcome more text compression rules, so feel free to
       suggest more hash entries like:

               WORD       => CONVERSION
               MULTI WORD => CONVERSION

NAME
   Squeeze.pm - Shorten text to minimum syllables by using hash and vowel
   deletion

REVISION
   $Id: Squeeze.pm,v 1.24 1998/10/08 14:58:15 jaalto Exp $

SYNOPSIS
       use Squeeze;            # import only function
       use Squeeze qw( :ALL ); # import all functions and variables
       use English;

       while (<>)
       {
           print SqueezeText $ARG;
       }

DESCRIPTION
   Squeeze text (English) to the most compact format possible so that it is
   barely readable. You should convert all text to lowercase for maximum
   compression, because optimisations have been designed mostly for
   uncapitalised letters.

       `Warning: Each line is processed multiple times, so prepare for slow
       conversion time'

   You can use this module eg to preprocess text before it is sent to
   electronic media that has maximum text size limit. For example Pagers
   have some arbitrary text size limit, say 200 characters, which you want
   to fill as much as possible. Alternatively you may have a GSM Cellular
   phone which is capable of receiving Short Messages (SMS), whose text
   limit is 160 characters. To your amusement, the description text of this
   paragraph has been converted below using this library's SqueezeText()
   function. See for yourself if it's readable (Yes, it takes some time to get
   used to). The compression ratio is typically 30-40%.

       u _n use thi mod to prprce txt bfre i_s snt to
       elrnic mda t_hs max txt siz lim. f_xmple Pag hv
       som abitry txt siz lim, say 200 chr, W/ u wnt to fll
       as mch as psbleAlternatvly u may hv GSM Cllar PH wch is
       cpble of rcivng Short msg (SMS), WS/ txt lim is 160
       chrTo u/ amsment, dsc txt of thi prgra has
       ben cnv_ blow usng thi lbrrys SquezText() fnc See
       uself if i_s redble (Yes, it tak som T to get usdto
       compr rati is typcly 30-40

   There are a few grammar rules which are used to shorten some English
   tokens very much:

       Word that has _ is usually a verb

       Word that has / is usually a substantive, noun,
                       pronoun or other non-verb

   For example, these tokens must be understood before text can be read.
   This is not yet like Geek code, because you don't need external parser
   to understand this, but just some common sense and time to adapt
   yourself to this text. *For a complete up to date list, you have to peek
   at the source code*

       automatically => 'acly_'

       for           => 4
       for him       => 4h
       for her       => 4h
       for them      => 4t
       for those     => 4t

       can           => _n
       does          => _s

       it is         => i_s
       that is       => t_s
       which is      => w_s
       that are      => t_r
       which are     => w_r

       less          => -/
       more          => +/
       most          => ++

       however       => h/ver
       think         => thk_

       useful        => usful

       you           => u
       your          => u/
       you'd         => u/d
       you'll        => u/l
       they          => t/
       their         => t/r

       will          => /w
       would         => /d
       with          => w/
       without       => w/o
       which         => W/
       whose         => WS/

   Time is expressed with big letters

       time          => T
       minute        => MIN
       second        => SEC
       hour          => HH
       day           => DD
       month         => MM
       year          => YY

   Other Big letter acronyms

       phone         => PH

EXAMPLES
   To add new words e.g. to word conversion hash table, you'd define your
   custom set and merge them to existing ones. Do similarly to
   `%SQZ_WXLATE_MULTI_HASH' and `$SQZ_ZAP_REGEXP' and then start using the
   conversion function.

       use English;
       use Squeeze qw( :ALL );

       my %myExtraWordHash =
       (
             new-word1  => 'conversion1'
           , new-word2  => 'conversion2'
           , new-word3  => 'conversion3'
           , new-word4  => 'conversion4'
       );

       #   First take the existing tables and merge them with my
       #   translation table

       my %mySustomWordHash =
       (
             %SQZ_WXLATE_HASH
           , %SQZ_WXLATE_EXTRA_HASH
           , %myExtraWordHash
       );
       my $myXlat = 0;                             # state flag

       while (<>)
       {
           if ( $condition )
           {
               SqueezeHashSet \%mySustomWordHash;  # Use MY conversions
               $myXlat = 1;
           }

           if ( $myXlat and $condition )
           {
               SqueezeHashSet "reset";             # Back to default table
               $myXlat = 0;
           }

           print SqueezeText $ARG;
       }

   Similarly you can redefine the multi word translate table by supplying
   another hash reference in call to SqueezeHashSet(), and to kill more
   text immediately in addition to default, just concatenate the regexps to
   *$SQZ_ZAP_REGEXP*


KNOWN BUGS
   There may be a lot of false conversions and if you think that some word
   squeezing went too far, please turn on the debug and send the log to the
   maintainer. To see how the conversion goes e.g. for word *Messages*:

       use English;
       use Lingua::EN::Squeeze;

       SqueezeDebug( 1, '(?i)Messages' );

       $ARG = "This line has some Messages in it";
       print SqueezeText $ARG;

END