#!/usr/bin/perl
###########################################
# googledrill - Explore and follow Google
#               results
# Mike Schilli, 2002 ([email protected])
###########################################
use warnings;
use strict;

use Net::Google;
use HTML::TreeBuilder;
use LWP::Simple;
use URI::URL;

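   # Configuration: the Google query string, a
   # pattern for hits to skip (the target site
   # itself), the pattern of links to count,
   # and the paging parameters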
my $GOOGLE_SEARCH    = 'Schilli';
my $HIT_EXCL_PATTERN = qr(perlmeister\.com);
my $LINK_PATTERN     = qr(perlmeister\.com);
my $RESULTS_PER_PAGE = 100;
my $RESULTS_TOTAL    = 500;

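   # A license key for the Google Web APIs
   # (SOAP) service is required; at the time
   # of writing, keys were available free of
   # charge from http://www.google.com/apis/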
use constant LOCAL_GOOGLE_KEY =>
    "XXX_INSERT_YOUR_OWN_GOOGLE_KEY_HERE_XXX";

my $service = Net::Google->new(
   key   => LOCAL_GOOGLE_KEY,
);

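   # %links_seen maps every matching link to
   # the list of pages referring to it; the
   # length of that list is the link's
   # popularity count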
my %links_seen       = ();
my $hits_seen_total  = 0;

while($hits_seen_total < $RESULTS_TOTAL) {
       # Init search
   my $session = $service->search(
       max_results => $RESULTS_PER_PAGE,
       starts_at   => $hits_seen_total);
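       # (Net::Google transparently splits this
       # into several SOAP requests, since the
       # API returns at most 10 results per query)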
   $session->query($GOOGLE_SEARCH);

        # Contact Google for results; results()
        # returns a ref to an array of hit objects
   my @hits = @{ $session->results() };

       # Iterate over results
   for my $hit (@hits) {
       my $url = norm_url($hit->URL());

           # Skip hits on sites matching the exclude
           # pattern (here: the target site itself)
       next if $url =~ $HIT_EXCL_PATTERN;

           # Follow hit, retrieve site
       print "Getting $url\n";

       for my $link (get_links($url)) {

               # Keep only links pointing to the
               # site of interest
           next if $link !~ $LINK_PATTERN;

               # Record the referrer; the number of
               # referrers per link is its count
           push @{$links_seen{$link}}, $url;
       }
   }

   # Not enough results to continue?
   last if @hits < $RESULTS_PER_PAGE;
   $hits_seen_total += $RESULTS_PER_PAGE;
}

   # Print results, highest counts first
for my $link (sort { scalar @{$links_seen{$b}} <=>
                     scalar @{$links_seen{$a}}
                   } keys %links_seen) {
   print "$link (" .
      scalar @{$links_seen{$link}}, ")\n";

   for my $site (@{$links_seen{$link}}) {
       print "    $site\n";
   }
}

###########################################
sub get_links {
###########################################
   my($url) = @_;

   my @links = ();

       # Retrieve remote document
   my $data = get($url);
   if(! defined $data) {
       warn "Cannot retrieve $url\n";
       return @links;
   }

      # Extract <A HREF=...> links
   my $tree = HTML::TreeBuilder->new();
   $tree->parse($data);
      # Flush the parser (required by HTML::Parser)
   $tree->eof();
   my $ref = $tree->extract_links(qw/a/);
   if($ref) {
          # extract_links() returns a ref to an array
          # of arrays whose first element is the link;
          # resolve relative links against the page URL
       @links = map { norm_url($_->[0], $url)
                    } @$ref;
   }
   $tree->delete();

      # Kick out dupes and return the list
   my %dupes;
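      # ($dupes{$_}++ is false (0) the first time a
      # link is seen, so grep keeps first sightings)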
   return grep { ! $dupes{$_}++ } @links;
}

###########################################
sub norm_url {
###########################################
   my($url_string, $base) = @_;

   my $url = URI::URL->new($url_string, $base);
      # Resolve relative URLs against the base
      # (if one was passed in)
   $url = $url->abs() if defined $base;
   return $url->canonical()->as_string();
}
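
###########################################
# Usage: insert a valid Google API key
# above, then run the script without
# arguments:
#   $ googledrill
# It prints a "Getting <url>" line for each
# hit it follows and, at the end, the link
# tally with the most popular links first.
###########################################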