Article 7717 of comp.lang.perl:
Xref: feenix.metronet.com comp.lang.perl:7717
Newsgroups: comp.lang.perl
Path: feenix.metronet.com!news.ecn.bgu.edu!usenet.ins.cwru.edu!howland.reston.ans.net!pipex!uunet!munnari.oz.au!metro!usage!news
From: [email protected] (Cameron Simpson)
Subject: Re: Stumped at To: header parsing
Message-ID: <cameron-931108235339-1-06469@fuligin>
To: [email protected] (Scott K. Hutton)
Followup-To: comp.lang.perl
Sender: [email protected]
Nntp-Posting-Host: fuligin.spectrum.cs.unsw.oz.au
Reply-To: [email protected]
Organization: CS&E Computing Facility, Uni Of NSW, Oz
References: <[email protected]>
Errors-To: [email protected]
Date: Mon, 8 Nov 1993 12:53:54 GMT
Return-Receipt-To: [email protected]
Lines: 330

| I'm at a loss for an efficient way to parse a nasty header in the form
| of:
|
|   To: [email protected] "Frank Oobar, Director", [email protected] (BIFF)
|
| The line needs to be split into its component addresses, but it should
| never be split on a comma that occurs within quotes or parens.  I
| can't figure out a pattern that will work for this and might be forced
| to scan the line for commas and figuring out if we're in quotes/parens
| or not.
|
| Surely someone out there has already invented this wheel...
|
| Be happy to share a smattering of other mail parsing code in return
| (or, for that matter, even if you can't help).  I'm working up a
| mail parsing package, since that's what I seem to do the most.
|
|  -Scott

I append my rfc822.pl package. Let me know if it uses things I hacen't
included, or of any bugs you find. What you want are &parseaddrs and
&rawaddrs.
       - Cameron Simpson
         [email protected], DoD#743
--
Hacker: One who accidentally destroys.
Wizard: One who recovers afterwards.

#!/bin/sh
#

sed 's/^X//' > rfc822.pl <<'EOF-/home/cs/spectrum/fuligin/1/cameron/etc/pl/cs/rfc822.pl'
X#!/usr/local/bin/perl
X#
X# Code to support RFC822-style message headers.
X#
X# &clrhdrs
X#      Empty %'hdrs and @'hdrs.
X#
X# &hdrkey  Field name to key for array.
X# &hdrnorm Field name to output form (capitalise words).
X#
X# &hdr(name) -> @values or undef
X#      Returns text of header as list (if accomodating multiple entries)
X#      or the list joined with "\t\n" or ", " if in scalar context.
X#
X# &addhdrs(@message_lines)
X#      Extract leading header lines from @lines up to blank line if present
X#      and add to @'hdrs. Then rebuild %'hdrs to be the content of each line
X#      keyed by downcased field name (and _ -> -). Multiple bodies are joined
X#      by ", " if in $rfc822'listfieldptn or by "\t\n" otherwise. Returns the
X#      unprocessed lines.
X#
X# &delhdrs(@field_names)
X#      Remove all references to the specified headers from @'hdrs and %'hdrs.
X#
X# &synchdrs
X#      Rebuild %'hdrs from @'hdrs.
X#
X# &parseaddrs($addresslist) -> @(precomment, address, postcomment)
X#      Break comma separated address list into a list of tuples,
X#      being leading comment, address portion, trailing comment.
X#
X# &rawaddrs(@(pre,addr,post)) -> @addrs
X#      Extract the middle elements from a 3-tuple list.
X#
X# &msgid
X#      Generate a message-id for an article.
X#
X
Xpackage rfc822;
X
X@mailhdrs=('to','cc','bcc','from','sender','reply-to','return-receipt-to',
X          'errors-to');
X@newshdrs=('newsgroups','followup-to');
X$mailptn=join('|',@mailhdrs);
X$newsptn=join('|',@newshdrs);
X
X@listhdrs=(@mailhdrs,@newshdrs,'keywords');
X$listfieldptn=join('|',@listhdrs);
X
X&clrhdrs;
X
Xsub clrhdrs
X       { undef %'hdrs, @'hdrs;
X         $synced=1;
X       }
X
Xsub hdr
X       { local($_)=@_;
X
X         $_=&hdrkey($_);
X
X         &synchdrs;
X
X         return undef unless defined($'hdrs{$_});
X
X         return $'hdrs{$_} unless wantarray;
X
X         local(@bodies)=();
X
X         for $hdr (@'hdrs)
X               { push(@bodies,$') if $hdr =~ /^$_:\s*/;
X               }
X
X         @bodies;
X       }
X
X# add a line to @hdrs and %hdrs
X# just adds to @hdrs until it gets "" or "\n" and then sets up %hdrs
X# field names matching $commaptn are concatenated with commas,
X# otherwise with "\n\t".
Xsub addhdrs    # @lines -> @remaining_lines
X       { local($commaptn)=$listfieldptn;
X         local($_,$hdr);
X
X         $hdr='';
X         while (defined($_=shift))
X               { s/\r?\n$//;
X                 last if !length;
X
X                 if (/^\s/)
X                       { $hdr.="\n$_";
X                       }
X                 else
X                 { length($hdr) && push(@'hdrs,$hdr);
X                   $hdr=$_;
X                 }
X               }
X
X         length($hdr) && push(@'hdrs,$hdr);
X
X         $synced=0;
X
X         @_;
X       }
X
X# delete all instances of specified headers
Xsub delhdrs    # (@fieldnames) -> void
X       { local(@fields)=@_;
X         local($field,@newhdrs);
X
X         for $field (@fields)
X               { $field=&hdrkey($field);
X                 @hdrs=eval 'grep(!/^$field:/o,@hdrs)';
X               }
X
X         $synced=0;
X       }
X
X# replace header lines
Xsub rephdrs    # (@headerlines) -> void
X       { local(@reps)=@_;
X         local($_,$field);
X
X         for (@reps)
X               { next unless /^([-\w]+):/;
X                 $field=&hdrkey($1);
X
X                 for $hdr (@'hdrs)
X                       { $hdr =~ s/^$field:/X-Original-$&/i;
X                       }
X               }
X
X         $synced=0;
X
X         &addhdrs(@reps);
X       }
X
X# Get key from field.
Xsub hdrkey
X       { local($_)=@_;
X         tr/_A-Z/-a-z/;
X         $_;
X       }
X
X# Get normal form of field name.
Xsub hdrnorm
X       { local($_)=&hdrkey($_[0]);
X
X         print STDERR "norm($_) -> ";
X         s/\b[a-z]/\u$&/g;
X         print STDERR "$_\n";
X         $_;
X       }
X
X# Rebuild %'hdrs from @'hdrs.
Xsub synchdrs   # (void) -> (void)
X       { return if $synced;
X
X         local($key,$field,$_);
X
X         undef %'hdrs;
X         for (@'hdrs)
X               { if (/^([^\s:]+):\s*/)
X                       { $key=$1;
X                         $field=$'; $field =~ s/^\s+//;
X                         $key=&hdrkey($key);
X                         if (defined($'hdrs{$key}))
X                               { if ($key =~ /^$commaptn$/o)
X                                       { $'hdrs{$key}.=', ';
X                                       }
X                                 else
X                                 { $'hdrs{$key}.="\n\t";
X                                 }
X
X                                 $'hdrs{$key}.=$field;
X                               }
X                         else
X                         { $'hdrs{$key}=$field;
X                         }
X                       }
X               }
X
X         $synced=1;
X       }
X
X# parse an RFC822 address list returning a list of tuples
X#      (leading comment, address, trailing comment, ...)
Xsub parseaddrs # (addrlist) -> @(precomment, addr, postcomment)
X       { local($_)=@_;
X         local(@parsed);
X         local($pre,$addr,$post)=('','','');
X
X         s/^\s+//;
X         while (length)
X               { if (/^,/)
X                       # end of currently building address
X                       { $_=$';
X                         if (length($pre) && !length($addr))
X                               { $addr=$pre; $pre='';
X                               }
X
X                         if (length($pre) || length($addr) || length($post))
X                               { push(@parsed,$pre,$addr,$post);
X                               }
X
X                         $pre='';
X                         $addr='';
X                         $post='';
X                       }
X                 elsif (!length($addr) && /^[-\w_.]+(@[-\w_.]+)?/)
X                       { $_=$';
X                         $addr=$&;
X                       }
X                 elsif (/^"([^"]|\\")*"/ || /^'([^']|\\')*'/)
X                       { $_=$';
X                         if (length($addr))
X                               { $post .= " $&";
X                               }
X                         else
X                         { $pre .= " $&";
X                         }
X                       }
X                 elsif (/^<[^>\s]*>/)
X                       { $_=$';
X                         if (length($addr))
X                               { $pre.=' '.$addr;
X                                 if (length($post))
X                                       { $pre.=' '.$post;
X                                         $post='';
X                                       }
X                               }
X
X                         $addr=$&;
X                       }
X                 elsif (/^[^,\s]+/)
X                       { $_=$';
X                         if (length($addr))
X                               { $post.=" $&";
X                               }
X                         else
X                         { $pre.=" $&";
X                         }
X                       }
X                 else
X                 { print STDERR "trouble parsing, remaining address is \"$_\"\n";
X                 }
X
X                 s/^\s+//;
X               }
X
X         if (length($pre) && !length($addr))
X               { $addr=$pre; $pre='';
X               }
X
X         if (length($pre) || length($addr) || length($post))
X               { push(@parsed,$pre,$addr,$post);
X               }
X
X         for (@parsed)
X               { s/^\s+//;
X               }
X
X         @parsed;
X       }
X
X# strip out the raw addresses from the result of &parseaddrs
Xsub rawaddrs
X       { local(@rawaddrs);
X         local($_);
X
X         while (defined($_=shift))
X               { $_=shift;
X                 last if !defined;
X                 s/^<([^>\s]+)>$/$1/;
X                 push(@rawaddrs,$_);
X                 shift;
X               }
X
X         @rawaddrs;
X       }
X
X$msgid_count=0;
Xsub msgid
X       { local($sec,$min,$hour,$mday,$mon,$year,@etc)=localtime(time);
X
X         $msgid_count++;
X         sprintf("<%s-%02d%02d%02d%02d%02d%02d-%d-%05d@%s>",
X               $'USER,
X               $year,$mon+1,$mday,$hour,$min,$sec,
X               $msgid_count,
X               $$,
X               $'HOSTNAME);
X       }
X
X1;     # for require
EOF-/home/cs/spectrum/fuligin/1/cameron/etc/pl/cs/rfc822.pl

exit 0