NAME
   "Text::Corpus::VoiceOfAmerica" - Make a corpus of VOA documents for
   research.

SYNOPSIS
     use Cwd;
     use File::Spec;
     use Data::Dump qw(dump);
     use Log::Log4perl qw(:easy);
     use Text::Corpus::VoiceOfAmerica;
     Log::Log4perl->easy_init ($INFO);
     my $corpusDirectory = File::Spec->catfile (getcwd(), 'corpus_voa');
     my $corpus = Text::Corpus::VoiceOfAmerica->new (corpusDirectory => $corpusDirectory);
     $corpus->update (testing => 1, verbose => 1);
     my $document = $corpus->getDocument (index => 0);
     dump $document->getBody;
     dump $document->getCategories;
     dump $document->getContent;
     dump $document->getDate;
     dump $document->getDescription;
     dump $document->getTitle;
     dump $document->getUri;

DESCRIPTION
   "Text::Corpus::VoiceOfAmerica" can be used to create a temporary corpus
   of Voice of America news documents for personal research and testing of
   information processing methods. Read the Voice of America's Terms of Use
   statement to ensure you abide by it when using this module.

   The categories, description, title, etc. of a specified document are
   accessed using Text::Corpus::VoiceOfAmerica::Document. Also, all errors
   and warnings are logged using Log::Log4perl, which should be
   initialized before this module is used.

CONSTRUCTOR
 "new"
   The constructor "new" creates an instance of the
   "Text::Corpus::VoiceOfAmerica" class with the following parameters:

   "corpusDirectory"
        corpusDirectory => '...'

       "corpusDirectory" is the directory that documents are cached into
       using CHI. If "corpusDirectory" is not defined, then the path
       specified in the environment variable
       "TEXT_CORPUS_VOICEOFAMERICA_CORPUSDIRECTORY" is used if it is
       defined. If the directory defined does not exist, it will be
       created. A message is logged and an exception is thrown if no
       directory is specified.

METHODS
 "getDocument"
    getDocument (index => $documentIndex)
    getDocument (uri => $uri)

   "getDocument" returns a Text::Corpus::VoiceOfAmerica::Document object
   for the document with index $documentIndex or URI $uri. The document
   indices range from zero to "getTotalDocuments() - 1". If an error
   occurs, "getDocument" logs it using Log::Log4perl and returns "undef".

   For example:

     use Cwd;
     use File::Spec;
     use Data::Dump qw(dump);
     use Log::Log4perl qw(:easy);
     use Text::Corpus::VoiceOfAmerica;
     Log::Log4perl->easy_init ($INFO);
     my $corpusDirectory = File::Spec->catfile (getcwd(), 'corpus_voa');
     my $corpus = Text::Corpus::VoiceOfAmerica->new (corpusDirectory => $corpusDirectory);
     $corpus->update (testing => 1, verbose => 1);
     my $document = $corpus->getDocument (index => 0);
     dump $document->getBody;
     dump $document->getCategories;
     dump $document->getContent;
     dump $document->getDate;
     dump $document->getDescription;
     dump $document->getTitle;
     dump $document->getUri;

 "getTotalDocuments"
     getTotalDocuments ()

   "getTotalDocuments" returns the total number of documents in the corpus.
   The index to the documents in the corpus ranges from zero to
   "getTotalDocuments() - 1".

 "getURIsInCorpus"
    getURIsInCorpus ()

   "getURIsInCorpus" returns an array reference of all the URIs in the
   corpus.

   For example:

     use Cwd;
     use File::Spec;
     use Data::Dump qw(dump);
     use Log::Log4perl qw(:easy);
     use Text::Corpus::VoiceOfAmerica;
     Log::Log4perl->easy_init ($INFO);
     my $corpusDirectory = File::Spec->catfile (getcwd(), 'corpus_voa');
     my $corpus = Text::Corpus::VoiceOfAmerica->new (corpusDirectory => $corpusDirectory);
     $corpus->update (testing => 1, verbose => 1);
     dump $corpus->getURIsInCorpus;

 "update"
     update (verbose => 0, testing => 0)

   This method updates the set of documents in the corpus by fetching any
   newly listed documents in the "sitemap.xml" file.

   "verbose"
         verbose => 0

       If "verbose" is positive, then after each new document is fetched a
       message is logged stating the number of documents remaining to fetch
       and the approximate time to completion. "update" returns the number
       of documents fetched.

   "testing"
         testing => 0

       If "testing" is true, only one document is added to the corpus.

   For example:

     use Cwd;
     use File::Spec;
     use Data::Dump qw(dump);
     use Log::Log4perl qw(:easy);
     use Text::Corpus::VoiceOfAmerica;
     Log::Log4perl->easy_init ($INFO);
     my $corpusDirectory = File::Spec->catfile (getcwd(), 'corpus_voa');
     my $corpus = Text::Corpus::VoiceOfAmerica->new (corpusDirectory => $corpusDirectory);
     $corpus->update (testing => 1, verbose => 1);
     dump $corpus->getTotalDocuments;

EXAMPLES
   The example below will print out all the information for each document
   in the corpus.

     use Cwd;
     use File::Spec;
     use Data::Dump qw(dump);
     use Log::Log4perl qw(:easy);
     use Text::Corpus::VoiceOfAmerica;
     Log::Log4perl->easy_init ($INFO);
     my $corpusDirectory = File::Spec->catfile (getcwd(), 'corpus_voa');
     my $corpus = Text::Corpus::VoiceOfAmerica->new (corpusDirectory => $corpusDirectory);
     $corpus->update (testing => 1, verbose => 1);
     my $totalDocuments = $corpus->getTotalDocuments;
     for (my $i = 0; $i < $totalDocuments; $i++)
     {
       eval
         {
           my $document = $corpus->getDocument(index => $i);
           next unless defined $document;
           my %documentInfo;
           $documentInfo{title} = $document->getTitle();
           $documentInfo{body} = $document->getBody();
           $documentInfo{date} = $document->getDate();
           $documentInfo{content} = $document->getContent();
           $documentInfo{categories} = $document->getCategories();
           $documentInfo{description} = $document->getDescription();
           $documentInfo{uri} = $document->getUri();
           dump \%documentInfo;
         };
     }

INSTALLATION
   To install the module set "TEXT_CORPUS_VOICEOFAMERICA_FULL_TESTING" to
   true and run the following commands:

     perl Makefile.PL
     make
     make test
     make install

   If you are on a Windows system you should use 'nmake' rather than 'make'.

   The module will install if "TEXT_CORPUS_VOICEOFAMERICA_FULL_TESTING" is
   not defined or false, but little testing will be performed.

AUTHOR
    Jeff Kubina<[email protected]>

COPYRIGHT
   Copyright (c) 2009 Jeff Kubina. All rights reserved. This program is
   free software; you can redistribute it and/or modify it under the same
   terms as Perl itself.

   The full text of the license can be found in the LICENSE file included
   with this module.

KEYWORDS
   information processing, english corpus, voa, voice of america

SEE ALSO
   CHI, Log::Log4perl, Text::Corpus::VoiceOfAmerica::Document