User:Joe's Olympic Bot/source2

use MediaWiki::API;
use Encode;
use LWP::UserAgent;
use utf8;

   # Gently pruned from the standard exclusion code to hardcode $user and $opt
   sub allowBots {
    my($text) = @_;

    my $user = "Joe's Olympic Bot";

    return 0 if $text =~ /{{[nN]obots}}/;
    return 1 if $text =~ /{{[bB]ots}}/;
    if($text =~ /{{[bB]ots\s*\|\s*allow\s*=\s*(.*?)\s*}}/s){
        return 1 if $1 eq 'all';
        return 0 if $1 eq 'none';
        my @bots = split(/\s*,\s*/, $1);
        return (grep $_ eq $user, @bots)?1:0;
    }
    if($text =~ /{{[bB]ots\s*\|\s*deny\s*=\s*(.*?)\s*}}/s){
        return 0 if $1 eq 'all';
        return 1 if $1 eq 'none';
        my @bots = split(/\s*,\s*/, $1);
        return (grep $_ eq $user, @bots)?0:1;
    }
    return 1;
  }

 
  # Within a single MediaWiki call, we ask the API to make up to 5 attempts, 10 s apart, until
  # the worst-case server lag is better than 5s. 
  my $mw = MediaWiki::API->new();
  $mw->{config}->{api_url} = 'http://en.wikipedia.org/w/api.php';

  # Delay/retry parameters
 
  $mw->{config}->{max_lag}         = 5;        # Tell MediaWiki to put us off it there's a 5s+ db lag out there
  $mw->{config}->{max_lag_delay}   = 10;  # ..and to wait 10s between retries
  $mw->{config}->{max_lag_retries} = 4;    # ..and to only make 4 retries before dropping back to our code

  # Our own delay parameters
  $standardelay      = 15;  
  $longdelay         = 900;  # ...if the API puts us off several times in a row, take a 15-minute break

  my $articles = null;

  # login
  while (1) { 
    if ($mw->login( { lgname => "Joe's Olympic Bot", lgpassword => '[REDACTED]' } )) {
      last;
    }

    if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) {
      sleep $longdelay;   
    } else {
      die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
    } 
  } 

  $profilesfound = 0;

  getsubd();



sub getsubd {

  while (1) {
    $sdirs = $mw->list ( {
       action => 'query',
       list => 'categorymembers',
       cmtitle => 'Category:Competitors at the 2012 Summer Olympics', 

       cmlimit => "max" },

      );  

    if ($articles) { last; }

    if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) {
      sleep $longdelay;   
    } else {
      die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
    } 
  }

  foreach (@{$sdirs}) {
    $sdirname = $_->{title};
    print "########### $sdirname\n";
    getlista($sdirname);
  }
}



sub getlista {

  my ($cata) = $_[0];

  # skip directories cleaned by hand already, why bother?

  # Get list of articles
  while (1) {
    $articles = $mw->list ( {
       action => 'query',
       list => 'categorymembers',
       cmtitle => $cata, 
       cmlimit => "max" },

        {  hook=> \&dsa	} ,
      );  

    if ($articles) { last; }

    if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) {
      sleep $longdelay;   
    } else {
      die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
    } 
  }
} 


sub dsa {

  my ($xyz) = $_[0];

  # scan through the articles...
  foreach (@{$xyz}) {

      my $thistitle = $_->{title};
      $listcount++;

# $thistitle = "User:Joe's Olympic Bot/Test";

#      print  "T: " . encode("iso-8859-1", $thistitle) . "\n";

      next if ($thistitle =~ m/^User:/);     
      next if ($thistitle =~ m/^Category:/);

      while (1) {
         my $pagehash = $mw->get_page( { title => $thistitle } );
         if ($pagehash) { last; }

         if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) {
            sleep $longdelay;   
          } else {
            die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
          } 
      }
     
      if (allowBots($pagehash->{'*'})) {       
              my $ref = $mw->get_page( { title => $thistitle } );
              $atext = decode_utf8 ( $ref->{'*'} );
              my $timestamp = $ref->{'timestamp'};

              # There are a couple articles which are not individual athletes but in these categories.  Restrict to living people
              if (!($atext =~ m/Category:Living people/)) {
                 print "NOTLIVING: " . encode("iso-8859-1", $thistitle) . "\n";
                 next;
              }


              if (($atext =~ m|\<ref\>\[http:\/\/www.london2012.com(\/)?[ '"][^<]+\<\/ref\>|) ||                   
                  ($atext =~ m|\<ref\>http:\/\/www.london2012.com(\/)?\<\/ref\>|)) {

  
                   print "BADREF: " . encode("iso-8859-1", $thistitle) . "\n";

                    $ret = findolympian($thistitle);
                    print " ->RESULT: " . encode("iso-8859-1", $ret) . "\n";

                    $striptitle = $thistitle;

                    if ($striptitle =~ m/([^(]+) \(/) {
                       $striptitle = $1;
                    }

                    $uastriptitle = $striptitle;
                    $uastriptitle =~ tr/ãâăáăäóěéíçćčÁúůřšșĽňńțśžŠ/aaaaaaoeeicccAuurssLnntszS/;


                    if (($londontitle ne $striptitle) && ($londontitle ne $uastriptitle)) {
                       print "GOFIX " . $striptitle . " as it doesn't match " . $londontitle . "\n";
                    } else {
                       print "AUTOFIX " . $striptitle . " seems to match " . $londontitle . "\n";



                    }
               }   # there's a bad URL

      } else {
        print "….DENIED\n";
      }
  }

  die if (profilesfound > 50);
}


sub findolympian {

  my ($olympian) = $_[0];

  my @o = split('\(', $olympian);
  $olympian = $o[0];
  $olympian =~ tr/ /+/;

  $u = 'http://www.london2012.com/search/index.htmx?q=' . $olympian;

# print "URL: " . $u . "\n";

$profilesfound++;

 my $ua = LWP::UserAgent->new;
 $ua->agent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0');
 $ua->default_header('Accept-Language' => "en-us,en;q=0.5");
 $ua->default_header('Accept' => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
 $ua->default_header('Connection' => "keep-alive");

 my $response = $ua->get($u);

 $html = $response->content;
 undef $londontitle; 

 undef $finishedurl;
 if (defined($html)) {
    if ($html =~ m#href=.(\/athlete\/[^\/]+\/)#       ) {
        $finishedurl = "http://www.london2012.com" . $1;

#  <span class="name">Felismina </span><span class="surname">Cavela </span>
       if ($html =~ m#"name"\>([^<]+)\<\/span\>\<span class="surname"\>([^<]+)\<\/span#) {
           $londontitle = decode_utf8($1). decode_utf8($2);

           $londontitle =~ s/\s+$//;

print "LONDONTITLE (" . $londontitle . ")\n";
        }
    } else {
       print "CANTFINDLINK\n" ;
       return $finishedurl;
    }
 } else {
   print "NORETURN\n" ;
 }

 return Encode::decode_utf8($finishedurl);
}







#if (0) {
#
#
 #                     $revtext = decode_utf8( $`)  
#
 #                           . decode_utf8("{{refn|name=OlympicBotGeneratedRef|{{cite web|title=") 
  #                          . decode_utf8($londontitle)
   #                         . decode_utf8("|url=") 
    #                        . decode_utf8($ret)
     #                       . decode_utf8("|work=London 2012|publisher=The London Organising Committee of the Olympic Games and Paralympic Games Limited|accessdate=15 September 2012}}}}")
      #                      . decode_utf8($'); 
       #                          
            #           $mw->edit( {
            #            action => 'edit',
            #            summary => "Joe's Olympic Bot: Correcting reference.",
            #            basetimestamp => $timestamp, # to avoid edit conflicts
            #            bot => 'true',
            #            title => $thistitle,
            #              basetimestamp => $timestamp, # to avoid edit conflicts
            #            text => $revtext   } )
            #           || die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
#}