User:Joe's Olympic Bot/source2
Appearance
use MediaWiki::API; use Encode; use LWP::UserAgent; use utf8; # Gently pruned from the standard exclusion code to hardcode $user and $opt sub allowBots { my($text) = @_; my $user = "Joe's Olympic Bot"; return 0 if $text =~ /{{[nN]obots}}/; return 1 if $text =~ /{{[bB]ots}}/; if($text =~ /{{[bB]ots\s*\|\s*allow\s*=\s*(.*?)\s*}}/s){ return 1 if $1 eq 'all'; return 0 if $1 eq 'none'; my @bots = split(/\s*,\s*/, $1); return (grep $_ eq $user, @bots)?1:0; } if($text =~ /{{[bB]ots\s*\|\s*deny\s*=\s*(.*?)\s*}}/s){ return 0 if $1 eq 'all'; return 1 if $1 eq 'none'; my @bots = split(/\s*,\s*/, $1); return (grep $_ eq $user, @bots)?0:1; } return 1; } # Within a single MediaWiki call, we ask the API to make up to 5 attempts, 10 s apart, until # the worst-case server lag is better than 5s. my $mw = MediaWiki::API->new(); $mw->{config}->{api_url} = 'http://en.wikipedia.org/w/api.php'; # Delay/retry parameters $mw->{config}->{max_lag} = 5; # Tell MediaWiki to put us off it there's a 5s+ db lag out there $mw->{config}->{max_lag_delay} = 10; # ..and to wait 10s between retries $mw->{config}->{max_lag_retries} = 4; # ..and to only make 4 retries before dropping back to our code # Our own delay parameters $standardelay = 15; $longdelay = 900; # ...if the API puts us off several times in a row, take a 15-minute break my $articles = null; # login while (1) { if ($mw->login( { lgname => "Joe's Olympic Bot", lgpassword => '[REDACTED]' } )) { last; } if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) { sleep $longdelay; } else { die $mw->{error}->{code} . ': ' . $mw->{error}->{details}; } } $profilesfound = 0; getsubd(); sub getsubd { while (1) { $sdirs = $mw->list ( { action => 'query', list => 'categorymembers', cmtitle => 'Category:Competitors at the 2012 Summer Olympics', cmlimit => "max" }, ); if ($articles) { last; } if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) { sleep $longdelay; } else { die $mw->{error}->{code} . ': ' . $mw->{error}->{details}; } } foreach (@{$sdirs}) { $sdirname = $_->{title}; print "########### $sdirname\n"; getlista($sdirname); } } sub getlista { my ($cata) = $_[0]; # skip directories cleaned by hand already, why bother? # Get list of articles while (1) { $articles = $mw->list ( { action => 'query', list => 'categorymembers', cmtitle => $cata, cmlimit => "max" }, { hook=> \&dsa } , ); if ($articles) { last; } if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) { sleep $longdelay; } else { die $mw->{error}->{code} . ': ' . $mw->{error}->{details}; } } } sub dsa { my ($xyz) = $_[0]; # scan through the articles... foreach (@{$xyz}) { my $thistitle = $_->{title}; $listcount++; # $thistitle = "User:Joe's Olympic Bot/Test"; # print "T: " . encode("iso-8859-1", $thistitle) . "\n"; next if ($thistitle =~ m/^User:/); next if ($thistitle =~ m/^Category:/); while (1) { my $pagehash = $mw->get_page( { title => $thistitle } ); if ($pagehash) { last; } if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) { sleep $longdelay; } else { die $mw->{error}->{code} . ': ' . $mw->{error}->{details}; } } if (allowBots($pagehash->{'*'})) { my $ref = $mw->get_page( { title => $thistitle } ); $atext = decode_utf8 ( $ref->{'*'} ); my $timestamp = $ref->{'timestamp'}; # There are a couple articles which are not individual athletes but in these categories. Restrict to living people if (!($atext =~ m/Category:Living people/)) { print "NOTLIVING: " . encode("iso-8859-1", $thistitle) . "\n"; next; } if (($atext =~ m|\<ref\>\[http:\/\/www.london2012.com(\/)?[ '"][^<]+\<\/ref\>|) || ($atext =~ m|\<ref\>http:\/\/www.london2012.com(\/)?\<\/ref\>|)) { print "BADREF: " . encode("iso-8859-1", $thistitle) . "\n"; $ret = findolympian($thistitle); print " ->RESULT: " . encode("iso-8859-1", $ret) . "\n"; $striptitle = $thistitle; if ($striptitle =~ m/([^(]+) \(/) { $striptitle = $1; } $uastriptitle = $striptitle; $uastriptitle =~ tr/ãâăáăäóěéíçćčÁúůřšșĽňńțśžŠ/aaaaaaoeeicccAuurssLnntszS/; if (($londontitle ne $striptitle) && ($londontitle ne $uastriptitle)) { print "GOFIX " . $striptitle . " as it doesn't match " . $londontitle . "\n"; } else { print "AUTOFIX " . $striptitle . " seems to match " . $londontitle . "\n"; } } # there's a bad URL } else { print "….DENIED\n"; } } die if (profilesfound > 50); } sub findolympian { my ($olympian) = $_[0]; my @o = split('\(', $olympian); $olympian = $o[0]; $olympian =~ tr/ /+/; $u = 'http://www.london2012.com/search/index.htmx?q=' . $olympian; # print "URL: " . $u . "\n"; $profilesfound++; my $ua = LWP::UserAgent->new; $ua->agent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0'); $ua->default_header('Accept-Language' => "en-us,en;q=0.5"); $ua->default_header('Accept' => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); $ua->default_header('Connection' => "keep-alive"); my $response = $ua->get($u); $html = $response->content; undef $londontitle; undef $finishedurl; if (defined($html)) { if ($html =~ m#href=.(\/athlete\/[^\/]+\/)# ) { $finishedurl = "http://www.london2012.com" . $1; # <span class="name">Felismina </span><span class="surname">Cavela </span> if ($html =~ m#"name"\>([^<]+)\<\/span\>\<span class="surname"\>([^<]+)\<\/span#) { $londontitle = decode_utf8($1). decode_utf8($2); $londontitle =~ s/\s+$//; print "LONDONTITLE (" . $londontitle . ")\n"; } } else { print "CANTFINDLINK\n" ; return $finishedurl; } } else { print "NORETURN\n" ; } return Encode::decode_utf8($finishedurl); } #if (0) { # # # $revtext = decode_utf8( $`) # # . decode_utf8("{{refn|name=OlympicBotGeneratedRef|{{cite web|title=") # . decode_utf8($londontitle) # . decode_utf8("|url=") # . decode_utf8($ret) # . decode_utf8("|work=London 2012|publisher=The London Organising Committee of the Olympic Games and Paralympic Games Limited|accessdate=15 September 2012}}}}") # . decode_utf8($'); # # $mw->edit( { # action => 'edit', # summary => "Joe's Olympic Bot: Correcting reference.", # basetimestamp => $timestamp, # to avoid edit conflicts # bot => 'true', # title => $thistitle, # basetimestamp => $timestamp, # to avoid edit conflicts # text => $revtext } ) # || die $mw->{error}->{code} . ': ' . $mw->{error}->{details}; #}