Article 2366 of alt.sources:
Xref: feenix.metronet.com alt.sources:2366
Newsgroups: alt.sources
Path: feenix.metronet.com!news.utdallas.edu!hermes.chpc.utexas.edu!cs.utexas.edu!uunet!decwrl!decwrl!netcomsv!netcom.com!jolomo
From: [email protected] (Joe Morris)
Subject: newser - acc. news browser
Message-ID: <[email protected]>
Summary: Browses through saved News and Mail
Keywords: news mail perl
Organization: NETCOM On-line Communication Services (408 241-9760 guest)
X-Newsreader: TIN [version 1.1 PL8]
Date: Thu, 2 Dec 1993 23:31:07 GMT
Lines: 448

Perl program that treats those accumulated News and Mail files as
a sort of database.

     Please mail any modifications to [email protected]

#!/usr/bin/perl
#
# newser -- a News and Mail query browser. Joe Morris [email protected]
#
# $Id: newser,v 1.16 1993/12/02 20:20:10 joe Exp $
#
# SYNTAX
sub Usage {
   print "Usage: $0 [-f from_string] [-s subject_string] [search_string]\n";
   print "       Wrap double quotes around regex strings\n";
   exit(1);
}
#
# DESCRIPTION
#    The raw materials for this program are files where you've appended all
#    your online goodies. All those articles that you thought were worth
#    saving but now are taking 100MB of disk space and all you can do
#    is grep around in them. BLAH :(
#       NEWSER treats each of these files as a series of discreet messages,
#    you will be much better off if you just appended these things one after
#    the other, never editing the area which really defines a message ---
#    The Header. (Formal description of a "message" is in Notes section).
#    The USENET digests I've tried have worked fine.
#       With NEWSER, you can search on specific subjects, authors (From lines),
#    and things within the body of the message. A search for the word "From"
#    in the body will not match the "From"'s in headers, searches for
#    a specific subject will only match Subject lines in the header.
#       This program was prompted by a recent discussion on
#    alt.folklore.computers where it seemed that this sort of thing is
#    not readily available today -- so here's my 300 lines worth.
#
#    This is alpha, it's been run successfully on Linux(4MB ram :), SUN and AIX.
#
# ENVIRONMENT
#    NDIRS (news-dirs) a PATH-like variable to searched for your accumulated
#       articles and e-mail. Defaults to "$HOME/News:$HOME/Mail".
#    PAGER if unset or empty, defaults to "less".
#
# NOTES, Questions and Bugs
# 0. It runs about as quickly as you would expect. Perl experts please mail
#    me useful optimizations -- especially in got_a_match().
# 1. Probably am overusing global variables
# 2. Does anyone have a regex to handle most every "Date:" format that
#    shows up in News and Mail -- To do date ranges. I haven't really needed
#    this feature, is it useful?
# 3. What's the best way to implement ignore case matches -- with evals or
#    more conditional statements or something else? I work around this now
#    with regex's like:  $ newser -s "[Hh][Ee][Ll][Pp]"
# 4. Right now all search criteria are ANDed together.
# 5. Quitting the PAGER before search is done will abort the program.
# 6. How can I position the PAGER at the most recently viewed message?
# 7. If you don't already use "less", this would be a good time to learn it.
# 8. What is a News (or Mail) message:
#    a. Top of file or blank line
#    b. header -- at least three lines (can't have a blank line in between)
#       must be matched by patterns in handle_head_line().
#       You can have any number of these in the header
#       and any order is ok
#    c. A blank line
#    d. body of message -- optional
#       any number of lines with one possible fatal error:
#       a blank line followed by 3 lines matched by handle_head_line().
#       Any blank line before the magic "3" will recover from error.
# 9. Please mail me any rewrites or additions -- [email protected]
#

require "getopts.pl";
require "assert.ph";
require "flush.pl";

#$DEBUG="yes";

# default environment stuff
$home=$ENV{'HOME'};
$DEFDIRS = "$home/News:$home/Mail";
if (length($ENV{'NDIRS'}) == 0) { $ENV{'NDIRS'} = $DEFDIRS; }
$DEFPAGER = "less";
if (length($ENV{'PAGER'}) == 0) { $ENV{'PAGER'} = $DEFPAGER; }
$PAGER = $ENV{'PAGER'};

# main()
   #
   # Define scope of From && Subject lines (META support date ranges??)
   #
   &Getopts('f:s:');  # From, Subject

   # Must have at least one author, subject or string
   if (!$opt_f && !$opt_s) {
       $norm_scope = shift || &Usage();
   }
   $from_scope = $opt_f;
   $subj_scope = $opt_s;

   # Arbitrary number of possible search paths, probably way too big
   @dbdirs = split(/:/, $ENV{'NDIRS'}, 999);

   open (SHOWU, "|$PAGER");
   print SHOWU "Looking for: \n";
   $from_scope && print SHOWU "    From    lines matching $from_scope\n";
   $subj_scope && print SHOWU "    Subject lines matching $subj_scope\n";
   $norm_scope && print SHOWU "    Message lines matching $norm_scope\n";
   print SHOWU "\nSearching....\n\n";
   &flush(SHOWU);

   # Number of matching messages
   $match_count = 0;

   foreach $dbdir (@dbdirs) {
       opendir(DIR,$dbdir) || ((warn "Can't process $dbdir: $!\n"), next);
       @files = sort grep(!/^\./, readdir(DIR));
       closedir(DIR);
       foreach $dbfile (@files) {
           $infile = "$dbdir/$dbfile";
       &get_the_message();
       }
   }
   print SHOWU "Done Searching...Quit PAGER to continue\n";
   if ($match_count < 1) { print SHOWU "\n\nNo matches found\n\n"; }
   close (SHOWU);

   # Lovely user interface
   while (1) {

       # Grab ze response
       print "Which Message:";
       $worthy_input = 0;
       $choice = (<STDIN>);

       for ($i = 1; $i <= $match_count; $i++) {
           if ($choice == $i) {
               $worthy_input = 1;
               &activate_message($i);
               open (SHOWU, "|$PAGER");
               print SHOWU "MESSAGE $i\n\n";
               &display_message(SHOWU);
               close (SHOWU);
           }
       }

       # Did I read garbage???
       if ($worthy_input < 1) { print "Take it easy\n"; exit(0); }

       open (SHOWU, "|$PAGER");
       print SHOWU "Quit PAGER. Enter Message# or enter garbage to exit\n\n";
       for ($i = 1; $i <= $match_count; $i++) {
           &activate_message($i);
           printf (SHOWU "$i - %-30.30s %-40.40s\n",$From_Line,$Date_Line);
           printf (SHOWU "       %.70s\n",$Subj_Line);
           printf (SHOWU "       in file %.60s\n\n",$Filename);
       }
       close (SHOWU);
   }

# End Main


sub get_the_message {
   # Make sure it's a text file before searching for matches
   if ( -T $infile ) {
       open(INF,$infile) || ((warn "Can't process $infile: $!\n"), return());
       &got_a_match();
       close(INF);
   }
}


sub got_a_match {
#
# The evals used here I think are necessary for the optional case
# insensitivity -- would it be cheaper to have additional
# conditionals instead?
#
   #
   # Many times a file even starts with a header
   #
   $likely_header = 1;
   $in_header     = 0;
   $in_message    = 0;
   $blank_line    = 1;
   $real_top      = 0;
   $tmp_top       = 0;
   $bottom        = 1;

   while($cur_line = <INF>) {
       chop $cur_line;

       $DEBUG && print SHOWU "$likely_header.$in_header.$in_message.$blank_line.$tmp_top.$real_top.$bottom.$infile.",substr($cur_line,0,20),"\n";
       if ($cur_line =~ /\S+/) {
           if ($in_header < 1) {

               if ($likely_header > 1) {
               #
               # We've already got one match, this could be a header
               #
                   $blank_line = 0;

                   $key_word = &handle_head_line($cur_line);

                   if ($key_word ne "0") {

                       ++$likely_header;

                       if ($likely_header > 2) {
                       #
                       # assume this is a header and we're not
                       # in a message -- not likely anymore, it's fact
                       #
                           # Pop off any header lines at tail of message
                           while (pop(@this_message) =~ /\S+/) { ; }

                           $likely_header = 0;
                           $in_header = 1;
                           $in_message = 0;
                       }

                   }
               } elsif ($blank_line > 0 || $likely_header > 0) {
               #
               # had a blank line -- header??
               #
                   $blank_line = 0;
                   $key_word = &handle_head_line($cur_line);

                   if ($key_word ne "0") {
                   #
                   # got 1 match so far, save this line as top of
                   # header. Bump likeliness to stage two and clear
                   # out header holder
                   #
                       while ($temp_head[0] =~ /\S+/) {
                           shift(@temp_head);
                       }
                       ++$likely_header;
                       if ($likely_header == 1) {
                           $tmp_top  = tell - length($cur_line) - 1;
                           $bottom  = tell;
                       }

                   } else {
                   #
                   # Where are we? try message
                   #
                       #if ($likely_header < 1) { $in_message = 1; }
                   }
               } else {
                   ++$in_message;
                   $blank_line = 0;
               }
           } else {
               ++$in_header;
               $blank_line = 0;
           }
       } else {
       #
       # each blank line not following header is a harbinger of
       # another header -- set likely header to stage 1
       #
           if ($in_header > 0) {
               $in_header = 0;

               # Was previous message a keeper?
               if (&this_matches() > 0) { &light_match(); }
               undef @this_message;

               # New message header
               @this_message = @temp_head;
               undef @temp_head;
               # store a separator
               push(@this_message,"MESSAGE STARTS HERE");
               $in_message = 1;
               $blank_line = 0;
               $real_top = $tmp_top;
           } else {
               $blank_line = 1;
           }
       }

       # any header stuff this might push will get undef'd
       if ($in_message > 0 ) { push(@this_message,"$cur_line"); }

       # Save for header
       push(@temp_head,"$cur_line");
       $DEBUG && print SHOWU "$likely_header.$in_header.$in_message.$blank_line.$tmp_top.$real_top.$bottom.$infile.",substr($cur_line,0,20),"\n";

   }
   # Catch the last message of the file, if it's a keeper
   $bottom = tell;
   if (&this_matches() > 0) { &light_match(); }
   undef @this_message;
}


sub handle_head_line {
   local($head_line) = @_;
   $ret_val = "0";

   if      ($head_line =~ /^Crossposted-To:/) {
       $ret_val = "c";
   } elsif ($head_line =~ /^Date:/) {
       $ret_val = "d";
   } elsif ($head_line =~ /^From[: ]/) {
       $ret_val = "f";
   } elsif ($head_line =~ /^Message-ID:/) {
       $ret_val = "m";
   } elsif ($head_line =~ /^Received:/) {
       $ret_val = "r";
   } elsif ($head_line =~ /^Reply-To:/) {
       $ret_val = "a";
   } elsif ($head_line =~ /^Subject:/) {
       $ret_val = "s";
   } elsif ($head_line =~ /^To:/) {
       $ret_val = "t";
   } elsif ($head_line =~ /^X-Mailer:/) {
       $ret_val = "x";
   }
   return ($ret_val);
}


sub this_matches {
   # Assume we got a full match until we know we didn't
   $met_my_match   = 1;
   $now_in_message = 0;
   $from_match     = 0;
   $subj_match     = 0;
   $norm_match     = 0;

   for (@this_message) {
       /^MESSAGE STARTS HERE$/ && ($now_in_message = 1);
       $try_line = $_;

       if ($now_in_message < 1) {
           $m_type = &handle_head_line($try_line);
           if ($m_type eq "s" && ($this_subj = $try_line)) {
               if ($subj_scope && $try_line =~ /^Subject:.*$subj_scope/) {
                   $subj_match = 1;
               }
           } elsif ($m_type eq "f" && ($this_from = $try_line)) {
               $this_from =~ /^From[: ]*(\S*)/;
               $this_from = $1;
               if ($from_scope && $try_line =~ /^From[: ].*$from_scope/) {
                   $from_match = 1;
               }
           } else { ($m_type eq "d") && ($this_date = $try_line); }
       } elsif ($norm_scope && $try_line =~ /$norm_scope/) {
           $norm_match = 1;
       }
       $DEBUG && print SHOWU "MM.$now_in_message.$m_type.$from_scope.$from_match.$subj_scope.$subj_match.$norm_scope.$norm_match.$try_line\n";
   }

   # Make sure we got every match we needed
   if ($from_scope && $from_match < 1) { $met_my_match = 0; }
   if ($subj_scope && $subj_match < 1) { $met_my_match = 0; }
   if ($norm_scope && $norm_match < 1) { $met_my_match = 0; }
   $DEBUG && print SHOWU "MM. $met_my_match - non-Zero if I met my match\n";
   return ($met_my_match);
}


sub light_match {
#
# got a matching pattern -- must find limits of this messgage and load
# the "pseudononymous" :)
#
   ++$match_count;

   &activate_message($match_count);
       $Filename       = $infile;
       $Date_Line      = $this_date;
       $From_Line      = $this_from;
       $Subj_Line      = $this_subj;
       $TOM            = $real_top;
       $BOM            = $bottom;

   printf (SHOWU "$match_count - %-30.30s %-40.40s\n",$From_Line,$Date_Line);
   printf (SHOWU "       %.70s\n",$Subj_Line);
   printf (SHOWU "       in file %.60s\n\n",$Filename);
   &flush(SHOWU);
}


sub display_message {
   open(MFILE,$Filename) || ((warn "Can't process $Filename: $!\n"), return());
   seek(MFILE,$TOM,0);
   while(<MFILE>) {
       last if (tell(MFILE) > $BOM);
       push(@mess_ary,$_);
   }
   close(MFILE);

   # Pop off any header lines at tail of message
   while (pop(@mess_ary) =~ /\S+/) { ; }

   # Display to the PAGER pipe
   for (@mess_ary) { print SHOWU "$_"; }

   undef @mess_ary;
}

sub gensym { 'gensym_' . ++$gensym'symbol; }

sub activate_message {
#
# This was smilingly stolen from tchrist -- faq
#
   local($message) = @_;

   &assert('$message',$message);

   $Last_Seq = $Current_Seq;

   if (! defined $Active_Messages{$message}) {
       $Active_Messages{$message} = &gensym;
       push(@Active_Messages, $message);
   }

   local($package) = $Active_Messages{$message};

   local($code)=<<"EOF";
   {
       package $package;
       *'Message_ID        = *Message_ID;
       *'Filename          = *Filename;
       *'Date_Line         = *Date_Line;
       *'From_Line         = *From_Line;
       *'Subj_Line         = *Subj_Line;
       *'TOM               = *TOM;
       *'BOM               = *BOM;
       }
EOF
   eval $code;
   $Current_Seq = $message;

   &panic("bad eval: $@\n$code\n") if $@;
}



--
       -joe                 |  "We're all Bozos on this Bus"  --firesigns
                            |  "I'm suffering from software bloat!"
     Joe Morris             |  "New things must be allowed to come up instead
 [email protected]          |       of being slapped down" --Jane Jacobs