#!/usr/pkg/bin/perl
#
use strict;
use warnings;
use HTML::Strip;
# Script body: read HTML from the input, strip the markup, write the result.
#
# The first command-line argument is the input path and the second is the
# output path.  Both are optional.
my $infile  = shift;
my $outfile = shift;

# Fall back to the defaults only when an argument is missing or empty.  A
# plain truth test would also throw away a file that is literally named "0".
# "-" stands for standard input when handed to _slurp_file, and "STDOUT" is
# the marker _burp_file compares against to select standard output.
$infile  = "-"      unless defined $infile  && length $infile;
$outfile = "STDOUT" unless defined $outfile && length $outfile;

# emit_spaces => 0 asks HTML::Strip not to put spaces where tags were removed.
my $hs = HTML::Strip->new( emit_spaces => 0 );

my $text_ref   = _slurp_file($infile);
my $clean_text = $hs->parse($$text_ref);
_burp_file( $outfile, \$clean_text );
# _slurp_file($infile)
#
# Read the whole of $infile into memory and return a reference to the
# resulting string.  The special name "-" means standard input.
#
# Dies with a message naming the file if it cannot be opened.
sub _slurp_file {
    my $infile = shift;

    my $fh;
    if ( $infile eq '-' ) {
        # Use STDIN directly rather than relying on the special meaning
        # that two-argument open() gives to "-".
        $fh = \*STDIN;
    }
    else {
        # Three-argument open: the name is always taken as a plain path, so
        # characters such as "|" or ">" or surrounding spaces in a file name
        # cannot turn the call into a pipe or a write.
        open( $fh, '<', $infile )
            or die "Unable to open $infile in _slurp_file: $!\n";
    }

    # With the input record separator undefined, <$fh> returns everything
    # that is left on the handle in one read.
    my $text = do { local $/; <$fh> };

    return \$text;
}
# _burp_file($outfile, $text_ref)
#
# Write the string referenced by $text_ref to $outfile, replacing any
# existing content.  The special name "STDOUT" sends the text to standard
# output instead of a file.
#
# Dies with a message naming the destination if the file cannot be opened,
# or if writing to it or closing it fails.  Returns 1 on success.
sub _burp_file {
    my $outfile  = shift;
    my $text_ref = shift;

    if ( $outfile eq "STDOUT" ) {
        print( ${$text_ref} )
            or die "Unable to write to $outfile in _burp_file: $!\n";
        return 1;
    }

    # Three-argument open keeps the mode separate from the name, so a file
    # name that itself begins with ">" or "|" cannot change how it is opened.
    open( my $fh, '>', $outfile )
        or die "Unable to open $outfile in _burp_file: $!\n";

    print {$fh} ${$text_ref}
        or die "Unable to write to $outfile in _burp_file: $!\n";

    # Output is buffered, so some write errors only show up at close time.
    close($fh)
        or die "Unable to close $outfile in _burp_file: $!\n";

    return 1;
}
=head1 SYNOPSIS
unhtml is a Perl script that strips HTML tags from text.
=head1 VERSION
This documentation describes version 1.3 of unhtml
=head1 DESCRIPTION
Uses HTML::Strip to do the real work; this script is a wrapper around
that module that lets you name the input and output files on the command
line. Standard input and standard output are used when no arguments are
given. If only one argument is given, it is taken as the input pathname
and the result is written to standard output.
=head1 USAGE
Examples (the following have equivalent results):
=over 4
=item unhtml < foo.html > foo.txt
=item unhtml foo.html > foo.txt
=item unhtml foo.html foo.txt
=back
=head1 REQUIRED ARGUMENTS
None. Acts as a STDIN/STDOUT pipe with no arguments.
=head1 OPTIONS
None.
=head1 DEPENDENCIES
Requires HTML::Strip (perl -MCPAN -e 'install HTML::Strip' as root on
any Unix-based OS will work).
=head1 LICENSE
Copyright (c) 2010
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see
<http://www.gnu.org/licenses/>.
=cut