# User:Plastikspork/spider
#
# From Wikipedia, the free encyclopedia
#!/usr/bin/perl
#
# Retrieve a subdivision place hierarchy from Maplandia
#

# Initialize
use warnings;
use strict;
use Benchmark;
use LWP::UserAgent;

# Package globals shared with the subroutines defined later in this file:
#   $DEBUG   - verbosity level (get_http reports redirects when it is >= 2)
#   $BASEURL - site root that spider_maplandia prepends to the links it finds
our ($DEBUG, $BASEURL);

# Set the debug level
$DEBUG = 2;

# Get the time (kept for compatibility; nothing in the visible script reads it)
my $mdate = localtime;

# Set the base url
$BASEURL = "http://www.maplandia.com/";

# Where should we start?
my $PLACEURL  = "http://www.maplandia.com/burma/";
my $BASETITLE = "Burma";

# The hierarchy is written both to STDOUT and to "<title>.log".  The handle
# stays the bareword OUT because spider_maplandia prints to it by name.
my $logfile = $BASETITLE.".log";
open(OUT, '>', $logfile)
  or die "Cannot open $logfile for writing: $!\n";

# Root entry of the wiki-formatted list, then walk everything below it.
print "[[".$BASETITLE."]]\n";
print OUT "[[".$BASETITLE."]]\n";
spider_maplandia( $PLACEURL, "*" );

# Buffered write errors only surface at close, so check it.
close(OUT)
  or die "Cannot close $logfile: $!\n";

# Retrieve a requested html page.
#
# Arguments:
#   $this_url - URL to fetch with an HTTP GET.
#
# Returns the response body as a string.  Returns the empty string when a
# request fails or when the page keeps redirecting for more than
# $max_redirects hops.
#
# Besides whatever LWP::UserAgent does for HTTP-level redirects, this also
# follows <META HTTP-EQUIV=Refresh ...> redirects found in the page body.
# Diagnostics go to STDERR so they never mix with the generated list.
sub get_http {
  my ($this_url) = @_;

  # Upper bound on meta-refresh hops, so two pages that refresh to each
  # other cannot keep this routine running forever.
  my $max_redirects = 10;

  # Matches an unquoted HTTP-EQUIV=Refresh meta tag and captures the target
  # URL.  \x20 is a literal space, spelled out because of the /x flag.
  my $meta_refresh = qr{
    <META [\r\n\x20]+ HTTP-EQUIV=Refresh [\r\n\x20]+
    CONTENT=" [\r\n\x20]* [0-9]+ ; [\r\n\x20]*
    URL= ( [^"\x20]* ) [\r\n\x20]* " [\r\n\x20]* >
  }xi;

  my $useragent = LWP::UserAgent->new;
  $useragent->agent("Mozilla/5.0 (compatible; educational project)");
  $useragent->timeout(60);    # Timeout after 60 seconds

  my $url = $this_url;
  for my $hop (0 .. $max_redirects) {
    my $useragent_result = $useragent->get($url);
    unless ($useragent_result->is_success) {
      print STDERR "Could not get $url\n";
      return "";
    }

    my $content = $useragent_result->content;

    # No meta refresh in the body: this is the page we were after.
    return $content unless $content =~ $meta_refresh;

    my $reurl = $1;
    if ($DEBUG >= 2) {
      # Escape ampersands so the logged URL is safe to paste into XML/HTML.
      (my $xmlreurl = $reurl) =~ s/&/&amp;/g;
      print STDERR "\nRedirected to $xmlreurl\n";
    }
    $url = $reurl;
  }

  print STDERR "Too many redirects starting from $this_url\n";
  return "";
}

# Recursively print the place hierarchy found below a Maplandia page.
#
# Arguments:
#   $topurl - URL of the page to scan for sub-places.
#   $indent - list prefix for entries found on this page (e.g. "*"); one
#             more "*" is prepended for every level of recursion.
#
# For every linked sub-place in the page's <div class="rozdel"> block a line
# of the form "<indent> [[Title]]" is written to STDOUT and to the OUT file
# handle, and the linked page is then spidered in turn.  Returns nothing.
#
# NOTE(review): visited URLs are not tracked, so a page that links back to
# one of its ancestors would be followed again indefinitely.
sub spider_maplandia {
  my ($topurl, $indent) = @_;

  # Grab the page (empty string if the fetch failed)
  my $in = get_http($topurl);

  # Preprocess the HTML
  $in =~ tr/\r\n/ /;               # Compress into single line (CR and LF -> space)
  $in =~ s/ +/ /g;                 # Remove redundant spacing
  $in =~ s/<\/?span[^<>]*>//gi;    # Drop <span> wrappers, keep their content

  # Get the list of subplaces; nothing to do when the block is absent
  return
    unless $in =~ /<div class="rozdel">((?:<ul[^<>]*>|<\/ul>|<li[^<>]*>|<\/li>|<hr[^<>]*>|<a[^<>]*>|<\/a>|[^<>]*)*)<\/div>/i;
  my $blist = $1;

  foreach my $place ($blist =~ /<li>((?:<a[^<>]*>|<\/a>|[^<>]*)*)<\/li>/gi) {
    next
      unless $place =~ /<a[^<>]*href="([^" ]*)" [ ]*title="([^"<>]*)"[^<>]*>[^<>]*<\/a>/;
    my ($surl, $title) = ($1, $2);

    # Square brackets would break the [[...]] wiki link, so swap them for
    # parentheses; quotes arrive HTML-escaped inside the title attribute.
    $title =~ tr/[]/()/;
    $title =~ s/&quot;/"/g;

    print $indent." [[".$title."]]\n";
    print OUT $indent." [[".$title."]]\n";

    # Absolute links are used as they are; anything else is taken relative
    # to the site root, without doubling the slash after the host name.
    my $nexturl;
    if ($surl =~ m{^https?://}i) {
      $nexturl = $surl;
    } else {
      (my $path = $surl) =~ s{^/+}{};
      $nexturl = $BASEURL.$path;
    }

    sleep 2;    # Be polite to the server between requests
    spider_maplandia( $nexturl, "*".$indent );
  }

  return;
}