#!/usr/bin/perl
#
# Retrieve a subdivision place hierarchy from Maplandia
#
# Initialize
use warnings;
use strict;
use Benchmark;
use LWP::UserAgent;
my ($mdate);
# Package globals shared with the subs defined further down in this file.
our ($DEBUG, $BASEURL);
# Set the debug level (get_http traces redirects when this is 2 or higher)
$DEBUG = 2;
# Get the time (scalar-context localtime yields a human-readable timestamp)
$mdate = localtime;
# Set the base url; spider_maplandia prepends it to every href it follows
$BASEURL = "http://www.maplandia.com/";
# Where should we start?
my $PLACEURL = "http://www.maplandia.com/burma/";
my $BASETITLE = "Burma";
# The hierarchy is written both to STDOUT and to "<title>.log".  The handle
# stays a bareword because spider_maplandia prints to OUT by name.
my $logfile = $BASETITLE.".log";
open(OUT, '>', $logfile)
    or die "Cannot open $logfile for writing: $!\n";
print "[[".$BASETITLE."]]\n";
print OUT "[[".$BASETITLE."]]\n";
# Walk the hierarchy; "*" is the prefix of the first nesting level
spider_maplandia( $PLACEURL, "*" );
# Buffered write errors only surface at close time, so check it
close(OUT)
    or die "Cannot close $logfile: $!\n";
sub get_http { # Retrieve a requested html page:
    # Fetches $this_url with an HTTP GET and follows <META HTTP-EQUIV=Refresh>
    # redirects found in the returned page body.
    #
    # Parameters:
    #   $this_url - URL to fetch.
    # Returns:
    #   The body of the final page, or "" when a request fails or the
    #   redirect limit is exceeded.
    # Diagnostics are written to STDERR; redirect tracing only when
    # $DEBUG >= 2.
    my ($this_url) = @_;

    # Upper bound on chained META redirects, so that a page refreshing to
    # itself (or a redirect cycle) cannot keep us fetching forever.
    my $max_redirects = 10;

    # Matches <META HTTP-EQUIV=Refresh CONTENT="<secs>; URL=<target>"> and
    # captures <target> in $1.
    my $EV_REDIR =
        "<META[\r\n\cM ]+HTTP-EQUIV=Refresh[\r\n\cM ]+"
        ."CONTENT=\"[\r\n\cM ]*[0-9]+;[\r\n\cM ]*"
        ."URL=([^\" ]*)[\r\n\cM ]*\"[\r\n\cM ]*>";

    my $useragent = LWP::UserAgent->new;
    $useragent->agent("Mozilla/5.0 (compatible; educational project)");
    $useragent->timeout(60); # Timeout after 60 seconds

    # One initial request plus at most $max_redirects follow-ups.
    for my $hop (0 .. $max_redirects) {
        my $http_request     = HTTP::Request->new(GET => $this_url);
        my $useragent_result = $useragent->request($http_request);

        unless ($useragent_result->is_success) {
            print STDERR "Could not get $this_url\n";
            return "";
        }

        my $content = $useragent_result->content;

        # No META refresh in the body: this is the page we were asked for.
        return $content unless $content =~ /$EV_REDIR/i;

        my $reurl = $1;
        print STDERR "\nRedirected to $reurl\n" if ($DEBUG >= 2);
        $this_url = $reurl;
    }

    print STDERR "Too many redirects, giving up at $this_url\n";
    return "";
}
sub spider_maplandia {
    # Recursively print the place hierarchy listed on a Maplandia page.
    #
    # Parameters:
    #   $topurl    - URL of the page whose sub-places should be listed.
    #   $indent    - prefix printed before each sub-place on this level;
    #                every nested level gets one more leading "*".
    #   $ancestors - optional hash ref of the URLs currently being expanded
    #                (this page's chain of parents).  Callers normally omit
    #                it; it exists so that a page linking back to itself or
    #                to a parent does not cause endless recursion.
    # Returns nothing.  Each sub-place is printed as "<indent> [[<title>]]"
    # to both STDOUT and the OUT filehandle.
    my ($topurl, $indent, $ancestors) = @_;
    $ancestors ||= {};

    # Refuse to re-enter a page that is already on the current path.
    if ($ancestors->{$topurl}) {
        print STDERR "Skipping $topurl: already being expanded\n"
            if ($DEBUG >= 2);
        return;
    }
    $ancestors->{$topurl} = 1;

    # Grab the page
    my $in = get_http($topurl);

    # Preprocess the HTML
    # Turn every CR and LF into a space so the page becomes a single line
    # without gluing together tokens that were split across lines.
    $in =~ tr/\r\n/ /;
    $in =~ s/[ ]+/ /g;            # Remove redundant spacing
    $in =~ s/<\/?span[^<>]*>//gi; # Span

    # Get the list of subplaces: the contents of <div class="rozdel">, which
    # may only consist of ul/li/hr/a tags and plain text.
    if ($in =~ /<div class="rozdel">((?:<ul[^<>]*>|<\/ul>|<li[^<>]*>|<\/li>|<hr[^<>]*>|<a[^<>]*>|<\/a>|[^<>]*)*)<\/div>/i) {
        my $blist = $1;
        foreach my $place ($blist =~ /<li>((?:<a[^<>]*>|<\/a>|[^<>]*)*)<\/li>/gi) {
            # Only list items holding a link with both href and title count.
            next unless $place =~ /<a[^<>]*href="([^" ]*)" [ ]*title="([^"<>]*)"[^<>]*>[^<>]*<\/a>/;
            my ($surl, $title) = ($1, $2);

            # Square brackets would break the [[...]] link syntax.
            $title =~ s/\[/(/g;
            $title =~ s/\]/)/g;
            # The capture above cannot contain a literal double quote, so
            # quotes can only arrive as the HTML entity; decode it.
            $title =~ s/&quot;/"/g;

            print $indent." [[".$title."]]\n";
            print OUT $indent." [[".$title."]]\n";

            sleep 2; # Pause between requests
            spider_maplandia( $BASEURL.$surl, "*".$indent, $ancestors );
        }
    }

    # This page is finished; it may legitimately appear again elsewhere.
    delete $ancestors->{$topurl};
    return;
}