Article 7717 of comp.lang.perl:
Xref: feenix.metronet.com comp.lang.perl:7717
Newsgroups: comp.lang.perl
Path: feenix.metronet.com!news.ecn.bgu.edu!usenet.ins.cwru.edu!howland.reston.ans.net!pipex!uunet!munnari.oz.au!metro!usage!news
From:
[email protected] (Cameron Simpson)
Subject: Re: Stumped at To: header parsing
Message-ID: <cameron-931108235339-1-06469@fuligin>
To:
[email protected] (Scott K. Hutton)
Followup-To: comp.lang.perl
Sender:
[email protected]
Nntp-Posting-Host: fuligin.spectrum.cs.unsw.oz.au
Reply-To:
[email protected]
Organization: CS&E Computing Facility, Uni Of NSW, Oz
References: <
[email protected]>
Errors-To:
[email protected]
Date: Mon, 8 Nov 1993 12:53:54 GMT
Return-Receipt-To:
[email protected]
Lines: 330
| I'm at a loss for an efficient way to parse a nasty header in the form
| of:
|
| To:
[email protected] "Frank Oobar, Director",
[email protected] (BIFF)
|
| The line needs to be split into its component addresses, but it should
| never be split on a comma that occurs within quotes or parens. I
| can't figure out a pattern that will work for this and might be forced
| to scan the line for commas and figuring out if we're in quotes/parens
| or not.
|
| Surely someone out there has already invented this wheel...
|
| Be happy to share a smattering of other mail parsing code in return
| (or, for that matter, even if you can't help). I'm working up a
| mail parsing package, since that's what I seem to do the most.
|
| -Scott
I append my rfc822.pl package. Let me know if it uses things I hacen't
included, or of any bugs you find. What you want are &parseaddrs and
&rawaddrs.
- Cameron Simpson
[email protected], DoD#743
--
Hacker: One who accidentally destroys.
Wizard: One who recovers afterwards.
#!/bin/sh
#
sed 's/^X//' > rfc822.pl <<'EOF-/home/cs/spectrum/fuligin/1/cameron/etc/pl/cs/rfc822.pl'
X#!/usr/local/bin/perl
X#
X# Code to support RFC822-style message headers.
X#
X# &clrhdrs
X# Empty %'hdrs and @'hdrs.
X#
X# &hdrkey Field name to key for array.
X# &hdrnorm Field name to output form (capitalise words).
X#
X# &hdr(name) -> @values or undef
X# Returns text of header as list (if accomodating multiple entries)
X# or the list joined with "\t\n" or ", " if in scalar context.
X#
X# &addhdrs(@message_lines)
X# Extract leading header lines from @lines up to blank line if present
X# and add to @'hdrs. Then rebuild %'hdrs to be the content of each line
X# keyed by downcased field name (and _ -> -). Multiple bodies are joined
X# by ", " if in $rfc822'listfieldptn or by "\t\n" otherwise. Returns the
X# unprocessed lines.
X#
X# &delhdrs(@field_names)
X# Remove all references to the specified headers from @'hdrs and %'hdrs.
X#
X# &synchdrs
X# Rebuild %'hdrs from @'hdrs.
X#
X# &parseaddrs($addresslist) -> @(precomment, address, postcomment)
X# Break comma separated address list into a list of tuples,
X# being leading comment, address portion, trailing comment.
X#
X# &rawaddrs(@(pre,addr,post)) -> @addrs
X# Extract the middle elements from a 3-tuple list.
X#
X# &msgid
X# Generate a message-id for an article.
X#
X
Xpackage rfc822;
X
X@mailhdrs=('to','cc','bcc','from','sender','reply-to','return-receipt-to',
X 'errors-to');
X@newshdrs=('newsgroups','followup-to');
X$mailptn=join('|',@mailhdrs);
X$newsptn=join('|',@newshdrs);
X
X@listhdrs=(@mailhdrs,@newshdrs,'keywords');
X$listfieldptn=join('|',@listhdrs);
X
X&clrhdrs;
X
Xsub clrhdrs
X { undef %'hdrs, @'hdrs;
X $synced=1;
X }
X
Xsub hdr
X { local($_)=@_;
X
X $_=&hdrkey($_);
X
X &synchdrs;
X
X return undef unless defined($'hdrs{$_});
X
X return $'hdrs{$_} unless wantarray;
X
X local(@bodies)=();
X
X for $hdr (@'hdrs)
X { push(@bodies,$') if $hdr =~ /^$_:\s*/;
X }
X
X @bodies;
X }
X
X# add a line to @hdrs and %hdrs
X# just adds to @hdrs until it gets "" or "\n" and then sets up %hdrs
X# field names matching $commaptn are concatenated with commas,
X# otherwise with "\n\t".
Xsub addhdrs # @lines -> @remaining_lines
X { local($commaptn)=$listfieldptn;
X local($_,$hdr);
X
X $hdr='';
X while (defined($_=shift))
X { s/\r?\n$//;
X last if !length;
X
X if (/^\s/)
X { $hdr.="\n$_";
X }
X else
X { length($hdr) && push(@'hdrs,$hdr);
X $hdr=$_;
X }
X }
X
X length($hdr) && push(@'hdrs,$hdr);
X
X $synced=0;
X
X @_;
X }
X
X# delete all instances of specified headers
Xsub delhdrs # (@fieldnames) -> void
X { local(@fields)=@_;
X local($field,@newhdrs);
X
X for $field (@fields)
X { $field=&hdrkey($field);
X @hdrs=eval 'grep(!/^$field:/o,@hdrs)';
X }
X
X $synced=0;
X }
X
X# replace header lines
Xsub rephdrs # (@headerlines) -> void
X { local(@reps)=@_;
X local($_,$field);
X
X for (@reps)
X { next unless /^([-\w]+):/;
X $field=&hdrkey($1);
X
X for $hdr (@'hdrs)
X { $hdr =~ s/^$field:/X-Original-$&/i;
X }
X }
X
X $synced=0;
X
X &addhdrs(@reps);
X }
X
X# Get key from field.
Xsub hdrkey
X { local($_)=@_;
X tr/_A-Z/-a-z/;
X $_;
X }
X
X# Get normal form of field name.
Xsub hdrnorm
X { local($_)=&hdrkey($_[0]);
X
X print STDERR "norm($_) -> ";
X s/\b[a-z]/\u$&/g;
X print STDERR "$_\n";
X $_;
X }
X
X# Rebuild %'hdrs from @'hdrs.
Xsub synchdrs # (void) -> (void)
X { return if $synced;
X
X local($key,$field,$_);
X
X undef %'hdrs;
X for (@'hdrs)
X { if (/^([^\s:]+):\s*/)
X { $key=$1;
X $field=$'; $field =~ s/^\s+//;
X $key=&hdrkey($key);
X if (defined($'hdrs{$key}))
X { if ($key =~ /^$commaptn$/o)
X { $'hdrs{$key}.=', ';
X }
X else
X { $'hdrs{$key}.="\n\t";
X }
X
X $'hdrs{$key}.=$field;
X }
X else
X { $'hdrs{$key}=$field;
X }
X }
X }
X
X $synced=1;
X }
X
X# parse an RFC822 address list returning a list of tuples
X# (leading comment, address, trailing comment, ...)
Xsub parseaddrs # (addrlist) -> @(precomment, addr, postcomment)
X { local($_)=@_;
X local(@parsed);
X local($pre,$addr,$post)=('','','');
X
X s/^\s+//;
X while (length)
X { if (/^,/)
X # end of currently building address
X { $_=$';
X if (length($pre) && !length($addr))
X { $addr=$pre; $pre='';
X }
X
X if (length($pre) || length($addr) || length($post))
X { push(@parsed,$pre,$addr,$post);
X }
X
X $pre='';
X $addr='';
X $post='';
X }
X elsif (!length($addr) && /^[-\w_.]+(@[-\w_.]+)?/)
X { $_=$';
X $addr=$&;
X }
X elsif (/^"([^"]|\\")*"/ || /^'([^']|\\')*'/)
X { $_=$';
X if (length($addr))
X { $post .= " $&";
X }
X else
X { $pre .= " $&";
X }
X }
X elsif (/^<[^>\s]*>/)
X { $_=$';
X if (length($addr))
X { $pre.=' '.$addr;
X if (length($post))
X { $pre.=' '.$post;
X $post='';
X }
X }
X
X $addr=$&;
X }
X elsif (/^[^,\s]+/)
X { $_=$';
X if (length($addr))
X { $post.=" $&";
X }
X else
X { $pre.=" $&";
X }
X }
X else
X { print STDERR "trouble parsing, remaining address is \"$_\"\n";
X }
X
X s/^\s+//;
X }
X
X if (length($pre) && !length($addr))
X { $addr=$pre; $pre='';
X }
X
X if (length($pre) || length($addr) || length($post))
X { push(@parsed,$pre,$addr,$post);
X }
X
X for (@parsed)
X { s/^\s+//;
X }
X
X @parsed;
X }
X
X# strip out the raw addresses from the result of &parseaddrs
Xsub rawaddrs
X { local(@rawaddrs);
X local($_);
X
X while (defined($_=shift))
X { $_=shift;
X last if !defined;
X s/^<([^>\s]+)>$/$1/;
X push(@rawaddrs,$_);
X shift;
X }
X
X @rawaddrs;
X }
X
X$msgid_count=0;
Xsub msgid
X { local($sec,$min,$hour,$mday,$mon,$year,@etc)=localtime(time);
X
X $msgid_count++;
X sprintf("<%s-%02d%02d%02d%02d%02d%02d-%d-%05d@%s>",
X $'USER,
X $year,$mon+1,$mday,$hour,$min,$sec,
X $msgid_count,
X $$,
X $'HOSTNAME);
X }
X
X1; # for require
EOF-/home/cs/spectrum/fuligin/1/cameron/etc/pl/cs/rfc822.pl
exit 0