#!/usr/pkg/bin/perl
#
use strict;
use warnings;

# Command-line arguments: optional input pathname, then optional output
# pathname.
my $infile = shift;
my $outfile = shift;

# Substitute the "-" / "STDOUT" markers when an argument is missing or empty.
# Testing defined/length instead of plain truth keeps a file that is
# literally named "0" usable.
$infile = "-" unless defined $infile && length $infile;
$outfile = "STDOUT" unless defined $outfile && length $outfile;

my $text_ref = _slurp_file ($infile);

# Strip HTML in 3 stages, from http://www.perlmonks.org/?node_id=46815
#
# Stage 1: remove comments. The /s modifier lets "." match newlines, so a
# comment that spans several lines is removed as well.
$$text_ref =~ s/<!--.*?-->//gs;

# Stage 2: inside a tag, collapse everything from the "<" up to and
# including one quoted string back to a bare "<". Repeating until neither
# substitution matches removes all quoted attribute values, so a ">" inside
# a quoted value cannot end the tag early in stage 3.
while ( $$text_ref =~ s/<(?!--)[^'">]*"[^"]*"/</g
             or $$text_ref =~ s/<(?!--)[^'">]*'[^']*'/</g ) {};

# Stage 3: remove the remaining "<...>" spans (a "<" directly followed by
# "--" is skipped by the lookahead).
$$text_ref =~ s/<(?!--)[^">]*>//g;

_burp_file ($outfile, $text_ref);

# Read an entire input source into memory.
#
# Arguments:
#   $infile - pathname of the file to read, or "-" to read standard input.
# Returns:
#   A reference to a scalar holding everything that was read.
# Dies:
#   With a message naming the file if it cannot be opened.
sub _slurp_file {
 my $infile = shift;
 my $from_stdin = ( $infile eq "-" );
 my $fh;

 if ($from_stdin) {
   # Three-arg open gives "-" no special meaning, so select STDIN here.
   $fh = \*STDIN;
 } else {
   # Three-arg open: the name is used verbatim, so characters such as
   # "|", ">" or surrounding whitespace are never treated as a mode or a
   # command to run.
   open( $fh, '<', $infile ) or die "Unable to open $infile in _slurp_file: $!\n";
 }

 # Undefining $/ locally makes a single read return the whole stream.
 my $text = do { local( $/ ) ; <$fh> } ;

 # Release the handle we opened; STDIN is left alone.
 close( $fh ) unless $from_stdin;

 return \$text;
}

# Write the text to its destination.
#
# Arguments:
#   $outfile  - pathname of the file to create/overwrite, or the literal
#               string "STDOUT" to print to the default output handle.
#   $text_ref - reference to the scalar holding the text to write.
# Dies:
#   With a message naming the file if it cannot be opened, written or
#   closed.
sub _burp_file {
 my $outfile = shift;
 my $text_ref = shift;

 if ($outfile eq "STDOUT") {
   print $$text_ref;
 } else {
   # Three-arg open: the name is used verbatim and can never be parsed as
   # a different mode or as a pipe to a command.
   open( my $fh, '>', $outfile ) or die "Unable to open $outfile in _burp_file: $!\n" ;
   print {$fh} $$text_ref or die "Unable to write to $outfile in _burp_file: $!\n" ;
   # Buffered write errors only surface at close, so check it.
   close( $fh ) or die "Unable to close $outfile in _burp_file: $!\n" ;
 }
}

=head1 SYNOPSIS

unhtml is a Perl script that strips HTML tags from text.

=head1 VERSION

This documentation describes version 1.3 of unhtml

=head1 DESCRIPTION

Uses a few regexes to do the real work of stripping HTML tags; this is not
the best solution, but works in most cases, and is free of any module
dependencies. You can specify command line file arguments - standard
input/output is assumed if no args are given. If only one arg is given, it
is assumed to be the input pathname.

=head1 USAGE

Examples (the following have equivalent results):

=over 4

=item unhtml < foo.html > foo.txt

=item unhtml foo.html > foo.txt

=item unhtml foo.html foo.txt

=back

=head1 REQUIRED ARGUMENTS

None. Acts as a STDIN/STDOUT pipe with no arguments.

=head1 OPTIONS

None.

=head1 LICENSE

Copyright (c) 2010, 2011 [email protected]

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see
<http://www.gnu.org/licenses/>.

=cut