#!/usr/bin/perl
###########################################
# googledrill - Explore and follow Google
# results
# Mike Schilli, 2002 ([email protected])
###########################################
use warnings;
use strict;
use Net::Google;
use HTML::TreeBuilder;
use LWP::Simple;
use URI::URL;
# --- Configuration ---------------------------------------
# Search term submitted to Google.
my $GOOGLE_SEARCH = 'Schilli';
# Google hits whose URL matches this pattern are skipped
# entirely (their pages are never fetched).
my $HIT_EXCL_PATTERN = qr(perlmeister\.com);
# On fetched pages, only links matching this pattern are
# counted and reported.
my $LINK_PATTERN = qr(perlmeister\.com);
# Page size requested from the Google API per round trip.
my $RESULTS_PER_PAGE = 100;
# Upper bound on the total number of hits examined.
my $RESULTS_TOTAL = 500;
# License key for the Google Web API (SOAP) service.
use constant LOCAL_GOOGLE_KEY =>
"XXX_INSERT_YOUR_OWN_GOOGLE_KEY_HERE_XXX";
my $service = Net::Google->new(
key => LOCAL_GOOGLE_KEY,
);
# Maps each interesting link URL to the list of hit pages
# (referrers) on which it was found.
my %links_seen = ();
# Number of Google hits processed so far; doubles as the
# paging offset for the next search request.
my $hits_seen_total = 0;
# Page through Google's result set, one page per round trip,
# until $RESULTS_TOTAL hits have been examined or Google has
# no more pages to offer.
while($hits_seen_total < $RESULTS_TOTAL) {

    # Set up a query for the next page of results
    my $session = $service->search(
        max_results => $RESULTS_PER_PAGE,
        starts_at   => $hits_seen_total,
    );
    $session->query($GOOGLE_SEARCH);

    # Contact Google; results() carries the hit list in its first slot
    my @hits = @{ ($session->results())[0] };

    HIT: for my $hit (@hits) {
        my $url = norm_url($hit->URL());

        # Eliminate unwanted sites
        next HIT if $url =~ $HIT_EXCL_PATTERN;

        # Fetch the hit page, keep only the interesting links on it,
        # and remember which page referred to each one
        print "Getting $url\n";
        for my $link (grep { $_ =~ $LINK_PATTERN } get_links($url)) {
            push @{ $links_seen{$link} }, $url;
        }
    }

    # A short page means Google ran out of results
    last if @hits < $RESULTS_PER_PAGE;
    $hits_seen_total += $RESULTS_PER_PAGE;
}
# Report: links ordered by descending referrer count, each
# followed by the pages that referenced it.
for my $link (
    sort { @{ $links_seen{$b} } <=> @{ $links_seen{$a} } }
    keys %links_seen
) {
    my @referrers = @{ $links_seen{$link} };
    printf "%s (%d)\n", $link, scalar @referrers;
    print " $_\n" for @referrers;
}
###########################################
sub get_links {
###########################################
    # Fetch the document at $url and return the de-duplicated list of
    # canonicalized URLs found in its <A HREF=...> tags. Relative HREFs
    # are resolved against the page's own URL before normalization.
    # Returns an empty list (after a warning) if the page cannot be
    # retrieved.
    my($url) = @_;
    my @links = ();

    # Retrieve remote document
    my $data = get($url);
    if(! defined $data) {
        warn "Cannot retrieve $url\n";
        return @links;
    }

    # Extract <A HREF=...> links
    my $tree = HTML::TreeBuilder->new();
    $tree->parse($data);
    $tree->eof();   # BUGFIX: flush the parser -- without eof() the
                    # tree may be left incomplete before extraction

    my $ref = $tree->extract_links(qw/a/);
    if($ref) {
        # BUGFIX: resolve each HREF against the base URL of the page
        # it came from, so relative links become absolute instead of
        # being canonicalized as-is.
        @links = map {
            norm_url(URI::URL->new($_->[0], $url)->abs()->as_string())
        } @$ref;
    }
    $tree->delete();   # free the tree; its nodes are self-referential

    # Kick out dupes (preserving first-seen order) and return the list
    my %dupes;
    return grep { ! $dupes{$_}++ } @links;
}
###########################################
sub norm_url {
###########################################
my($url_string) = @_;
my $url = URI::URL->new($url_string);
return $url->canonical()->as_string();
}