User:OrphanBot/libPearle2.pl
Appearance
### IMPORTANT ### # This code is released into the public domain. ### RECENT CHANGES ### # 30 Nov 2005: Created, based off of the 12 Nov 2005 version of Pearle Wisebot # 15 Feb 2006: Modifed "retry" to work with any function that signals failure by dying, modified to use a simple exponential backoff formula # Simplified "limit" code, modified to take an optional parameter # Added "config" function as a clean interface to change internal parameters # Modified Wiki-access functions for use with the new "retry" function # Cleanup of boolean config vars to use standard Perl boolean conventions # 28 Feb 2006: Added checkLogin bottleneck, option to allow editing while logged out # Added support for proxy servers # 8 Mar 2006: Added support for getting a user's contributions # Added support for retrieving logs # Separated out some common regex parts into variables # 29 Mar 2006: Added protection against Unicode in URLs # Made thrown exceptions consistent # Sanity-checking on postPage: talkpage without article, userpage or user talkpage without user # 17 May 2005: Improved log retrieval # 12 Jul 2007: Added timestamp to information retrieved from logs # Errors thrown by this package always begin with a three-digit number # 4xx: HTTP client errors # 505: Server error: HTTP version not supported # 509: Server error: Bandwidth exceeded # # 900: Unspecified internal error. # 901: Library not initialized. You didn't call Pearle::init() before calling this function. # 902: Parameter error. You made a function call, but forgot a mandatory parameter, or provided an invalid one. # # 920: Unexpected response. The MediaWiki site returned something unexpected. # 921: Unexpected logout. The MediaWiki site logged us out unexpectedly. # 922: Edit conflict. Someone edited the article while we were. # 923: Deleted article conflict. Someone deleted the article while we were editing. package Pearle; use strict; use warnings; use Time::HiRes; use utf8; use LWP::UserAgent; use HTTP::Cookies; use HTTP::Request::Common qw(POST); use HTML::Entities; # Standard regex parts $Pearle::regex_timestamp = '(\d\d):(\d\d), (\d\d?) (\w+) (\d\d\d\d)'; # Match and capture a Wikipedia timestamp $Pearle::regex_timestamp_nc = '\d\d:\d\d, \d\d? \w+ \d\d\d\d'; # Match a Wikipedia timestamp #<a href="/w/index.php?title=User:Angel_dunn&action=edit" class="new" title="User:Angel dunn"> #<a href="/wiki/User:Jimbo_Wales" title="User:Jimbo Wales"> $Pearle::regex_pagelink = '<a href="[^"]*"(?: class="new"|) title="([^"]*)">'; # Match and capture any page $Pearle::regex_redpagelink = '<a href="[^"]*" class="new" title="([^"]*)">'; # Match and capture nonexistant pages only $Pearle::regex_bluepagelink = '<a href="[^"]*" title="([^"]*)">'; # Match and capture existing pages only $Pearle::regex_pagelink_nc = '<a href="[^"]*"(?: class="new"|) title="[^"]*">'; # Match any page $Pearle::regex_redpagelink_nc = '<a href="[^"]*" class="new" title="[^"]*">'; # Match nonexistant pages only $Pearle::regex_bluepagelink_nc = '<a href="[^"]*" title="[^"]*">'; # Match existing pages only # Standard MediaWiki namespaces @Pearle::namespaces = ("", "Talk", "User", "User talk", "Wikipedia", "Wikipedia talk", "Image", "Image talk", "MediaWiki", "MediaWiki talk", "Template", "Template talk", "Help", "Help talk", "Category", "Category talk"); $Pearle::logfile = ""; $Pearle::_inited = 0; $Pearle::username = ""; $Pearle::password = ""; $Pearle::speedLimit = 10; # Seconds to wait by default when limit() is called $Pearle::_speedMult = 1; # Multiplier for default wait time if the wiki is being slow $Pearle::roughMode = 0; # Ignore most errors $Pearle::nullOK = 0; # Permit editing non-existent pages $Pearle::sanityCheck = 0; # Sanity checking on edits $Pearle::silent = 0; # Silent mode $Pearle::quiet = 0; # Quiet mode $Pearle::logoutOK = 0; # Permit editing while logged out $Pearle::proxy = undef; # Proxy to use # This must be the first function from the library called sub init { $Pearle::username = $_[0] or die("902 No username provided!\n"); $Pearle::password = $_[1] or die("902 No password provided!\n"); $Pearle::logfile = $_[2] or die("902 No logfile name provided!\n"); $Pearle::cookies = $_[3] or die("902 No cookie file provided!\n"); $Pearle::useragent = $_[4] or $Pearle::useragent = "PearleLib/0.2"; $Pearle::ua = LWP::UserAgent->new(timeout => 300); $Pearle::ua->agent($Pearle::useragent); $Pearle::ua->cookie_jar(HTTP::Cookies->new(file => $Pearle::cookies, autosave => 1)); $Pearle::ua->cookie_jar->load(); $Pearle::roughMode = "no"; # Hot pipes $| = 1; $Pearle::_inited = 1; } sub config { my %params = @_; $Pearle::speedLimit = $params{speedLimit} if(defined($params{speedLimit})); $Pearle::roughMode = $params{roughMode} if(defined($params{roughMode})); $Pearle::nullOK = $params{nullOK} if(defined($params{nullOK})); $Pearle::silent = $params{silent} if(defined($params{silent})); $Pearle::quiet = $params{quiet} if(defined($params{quiet})); $Pearle::logfile = $params{logfile} if(defined($params{logfile})); $Pearle::logoutOK = $params{logoutOK} if(defined($params{logoutOK})); $Pearle::sanityCheck = $params{sanityCheck} if(defined($params{sanityCheck})); if(exists($params{proxy})) { if(defined($params{proxy})) { myPrint("Proxying: $params{proxy}\n"); myLog("Proxying: $params{proxy}\n"); $Pearle::ua->proxy('http', $params{proxy}); $Pearle::proxy = $params{proxy}; } else { myPrint("Not proxying\n"); myLog("Not proxying\n"); $Pearle::ua->no_proxy(); $Pearle::proxy = undef; } } } sub myLog { die "901 Pearle library not initialized!\n" if(!$Pearle::_inited); open (LOG, ">>", $Pearle::logfile) || die "900 Could not append to log!"; print LOG $_[0]; close (LOG); } sub myPrint { return if($Pearle::silent); return if($Pearle::quiet); print @_; } sub myErrPrint { return if($Pearle::silent); return if($Pearle::quiet); print STDERR @_; } # Rate-limiting. Can be sensibly run even if libPearle isn't initialized sub limit { my ($i); $i = ($_[0] or ($Pearle::speedLimit * $Pearle::_speedMult)); $i = 10 if($i < 10); # Rate-limiting to avoid hosing the wiki server # Min 30 sec unmarked # Min 10 sec marked # May be raised by retry() if load is heavy ### ATTENTION ### # Increasing the speed of the bot to faster than 1 edit every 10 # seconds violates English Wikipedia rules as of April, 2005, and # will cause your bot to be banned. So don't change $normalDelay # unless you know what you are doing. Other sites may have # similar policies, and you are advised to check before using your # bot at the default speed. ################# while ($i >= 0) { sleep (1); myErrPrint("Sleeping $i seconds...\r"); $i--; } myErrPrint(" \r"); } sub login { die "901 Pearle library not initialized!\n" if(!$Pearle::_inited); my $res = $Pearle::ua->post( "http://en.wikipedia.org/w/wiki.phtml?title=Special:Userlogin&action=submitlogin", Content => [ wpName => $Pearle::username, wpPassword => $Pearle::password, wpRemember => 1, wpLoginAttempt => 1 ] ); if( 302 == $res->code ) { myPrint("Logged in as $Pearle::username\n"); myLog("Logged in as $Pearle::username\n"); # This may or may not actually work $Pearle::ua->cookie_jar->save(); return 1; } else { myPrint("Login failed\n"); myPrint("Code: ".$res->code."\n"); myLog("Login failed\n"); return 0; } } sub logout { my $res = $Pearle::ua->post( "http://en.wikipedia.org/w/wiki.phtml?title=Special:Userlogout", ); return 1; } sub checkLogin { my ($reply_text); $reply_text = $_[0]; if ($reply_text !~ m/>My talk<\/a>/ and !($Pearle::logoutOK)) { # We've lost our identity. myLog ("Wiki server is not recognizing me.\n"); die ("921 Wiki server is not recognizing me.\n"); } } # Make an HTTP request, performing basic error checking and handling. Suitable for use with the "retry" function sub httpRequest { my ($request, $response, $attemptStartTime, $attemptEndTime); $request = $_[0]; # Since not every server handles UTF-8 in URLs, and LWP doesn't escape them properly, escape every character > 255 $request->uri(unicodeToUrl($request->uri())); $response = $Pearle::ua->request($request); # Monitor wiki server responsiveness $attemptStartTime = Time::HiRes::time(); if ($response->is_success or $response->is_redirect) { return $response } else { myLog ("HTTP ERR (".$response->status_line.")\n".$response->content."\n"); myPrint("HTTP ERR (".$response->status_line.")\n".$response->content."\n"); # 50X HTTP errors mean there is a problem connecting to the wiki server. Can be remedied by waiting and trying again if (500 <= $response->code and 504 >= $response->code) { die("retry:".$response->status_line); } else { # Unhandled HTTP response. Waiting probably won't fix it die ($response->status_line."\n"); } } # Monitor wiki server responsiveness $attemptEndTime = Time::HiRes::time(); if($request->method() eq "POST") { if (($attemptEndTime - $attemptStartTime) > 20) { $Pearle::_speedMult = 60; myPrint("Wikipedia is very slow. Increasing minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n"); myLog("Wikipedia is very slow. Increasing minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n"); } # If the response time is between 10 and 20 seconds... elsif (($attemptEndTime - $attemptStartTime) > 10) { $Pearle::_speedMult = 6; myPrint("Wikipedia is somewhat slow. Setting minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n"); myLog("Wikipedia is somewhat slow. Setting minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n"); } # If the response time is less than 10 seconds... else { if ($Pearle::_speedMult != 1) { $Pearle::_speedMult = 1; myPrint( "Returning to normal minimum wait time.\n"); myLog("Returning to normal minimum wait time.\n"); } } } } # Check out a page for editing. sub getPage { die "901 Pearle library not initialized!\n" if(!$Pearle::_inited); my ($target, $request, $response, $reply, $text, $text2, $editTime, $startTime, $attemptStartTime, $attemptFinishTime, $token, $targetSafe); $target = $_[0]; if ($target =~ m/^\s*$/) { myLog("getPage: Null target."); die("902 getPage: Null target."); } $targetSafe = $target; $targetSafe =~ s/\&/%26/g; $targetSafe =~ s/\+/%2B/g; # Create a request-object myPrint("GET http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit\n"); myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit\n"); $request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit"); $response = startRetry(\&httpRequest, $request); $reply = $response->content; # This detects whether or not we're logged in. checkLogin($reply); # Check for blocking if($reply =~ /<h1 class="firstHeading">User is blocked<\/h1>/) { myLog("Blocked\n"); die("900 Blocked"); } $reply =~ m%<textarea\s+tabindex='1'\s+accesskey=","\s+name="wpTextbox1"\s+id="wpTextbox1"\s+rows='25'\s+cols='80'\s+>(.*?)</textarea>%s; $text = $1; $reply =~ m/value="(\d+)" name="wpEdittime"/; $editTime = $1; # Added 22 Aug 2005 to correctly handle articles that have # been undeleted $reply =~ m/value="(\d+)" name="wpStarttime"/; $startTime = $1; # Added 9 Mar 2005 after recent software change. $reply =~ m/value="([^"]+)" name="wpEditToken"/; $token = $1; ### if (($text =~ m/^\s*$/) and !$Pearle::nullOK) { myLog ("getPage($target): Null text!\n"); myLog ("\n---\n$reply\n---\n"); if ($Pearle::roughMode) { return; } else { die ("920 getPage($target): Null text!\n"); } } if (($editTime =~ m/^\s*$/) and !$Pearle::nullOK) { myLog ("getPage($target): Null time!\n"); myLog("\n---\n$reply\n---\n"); die ("920 getPage($target): Null time!\n"); } if (($text =~ m/>/) or ($text =~ m/</)) { myPrint($text); myLog("\n---\n$text\n---\n"); myLog ("getPage($target): Bad text suck!\n"); die ("920 getPage($target): Bad text suck!\n"); } # Change ( " -> " ) etc # This function is from HTML::Entities. decode_entities($text); # This may or may not actually work $Pearle::ua->cookie_jar->save(); return ($text, $editTime, $startTime, $token); } sub postPage { die "901 Pearle library not initialized!\n" if(!$Pearle::_inited); my ($request, $response, $pageName, $textToPost, $summaryEntry, $editTime, $startTime, $actual, $expected, $date, $editToken, $minor, $pageNameSafe); $pageName = $_[0]; $editTime = $_[1]; $startTime = $_[2]; $editToken = $_[3]; $textToPost = $_[4]; $summaryEntry = $_[5]; # Max 200 chars! $minor = $_[6]; $summaryEntry = substr($summaryEntry, 0, 200); if ($pageName eq "") { myLog ("postPage(): Empty pageName.\n"); die ("902 postPage(): Empty pageName.\n"); } if(!defined($minor)) { die "902 postPage(): Not enough parameters!\n"; } if ($summaryEntry eq "") { $summaryEntry = "Automated editing."; } $pageNameSafe = $pageName; $pageNameSafe =~ s/\&/%26/g; $pageNameSafe =~ s/\+/%2B/g; if ($minor eq "yes") { $request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageNameSafe}&action=submit", [wpTextbox1 => $textToPost, wpSummary => $summaryEntry, wpSave => "Save page", wpMinoredit => "on", wpEditToken => $editToken, wpStarttime => $startTime, wpEdittime => $editTime]; # Optional: wpWatchthis } else { $request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageNameSafe}&action=submit", [wpTextbox1 => $textToPost, wpSummary => $summaryEntry, wpSave => "Save page", wpEditToken => $editToken, wpStarttime => $startTime, wpEdittime => $editTime]; # Optional: wpWatchthis, wpMinoredit } # --- ## If posts are failing, you can uncomment the below to see what ## HTTP request is being made. # myLog($request->as_string()); # print $request->as_string(); $::speedLimit = 60 * 10; # print $::ua->request($request)->as_string; # --- myLog("POSTing..."); myPrint("POSTing..."); # Pass request to the user agent and get a response back $response = startRetry(\&httpRequest, $request); myLog("POSTed.\n"); myPrint("POSTed.\n"); if ($response->content =~ m/Please confirm that really want to recreate this article./) { myLog ($response->content."\n"); die ("923 Deleted article conflict! See log!"); } # Check the outcome of the response $response->code; if ($response->code != 302 and $response->code != 200) { myLog ("postPage(${pageName}, $editTime)#1 - expected =! actual\n"); myLog ($request->as_string()); myLog ("EXPECTED: 302'\n"); myLog (" ACTUAL: '" . $response->status_line . "'\n"); if ($Pearle::roughMode eq "yes") { return(); } else { die ("920 postPage(${pageName}, $editTime)#1 - expected =! actual - see log\n"); } } $expected = "http://en.wikipedia.org/wiki/${pageName}"; $expected = Pearle::urlEncode($expected); $actual = $response->headers->header("Location"); if (($expected ne $actual) and ($Pearle::roughMode ne "yes") and !(($actual eq "") and ($response->code == 200))) { myLog ("postPage(${pageName}, $editTime)#2 - expected =! actual\n"); myLog ("EXPECTED: '${expected}'\n"); myLog (" ACTUAL: '${actual}'\n"); die ("920 postPage(${pageName}, $editTime)#2 - expected =! actual - see log\n"); } if ($response->content =~ m/<h1 class="firstHeading">Edit conflict/) { myLog ("Edit conflict on '$pageName' at '$editTime'!\n"); die ("922 Edit conflict on '$pageName' at '$editTime'!\n"); } if($Pearle::sanityCheck and $pageName =~ /talk[ _]*:/i) # Check for accidental creation of a talkpage without a mainpage. Only works with bots using the "monobook" skin. { # Monobook:<li id="ca-nstab-main" class="new"><a href="/w/index.php?title=Kjsahfjrf&action=edit">Article</a></li> # Classic: <br /><a href="/w/index.php?title=Kjsahfjrf&action=edit" class="new" title="Kjsahfjrf">View article</a> if($response->content =~ /<li id="ca-nstab-[^"]" class="new">/) { myLog ("postPage(${pageName}) - Talkpage without article!\n"); die ("920 postPage(${pageName}) - Talkpage without article!\n"); } } if($Pearle::sanityCheck and $pageName =~ /^user[ _]*talk[ _]*:/) # Check for user talkpage for non-existant user { if($response->content !~ /User contributions/) { myLog ("postPge(${pageName}) - User talkpage for non-existant user!\n"); die ("920 postPge(${pageName}) - User talkpage for non-existant user!\n"); } } $Pearle::ua->cookie_jar->save(); return ($response->content); } # Get a list of the names of articles in a given category. sub getCategoryArticles { die "901 Pearle library not initialized!\n" if(!$Pearle::_inited); my ($target, $request, $response, $reply, $articles, $article, @articles, $targetSpace, $offset, $numberOfArticles, $url, @moreArticles); $target = $_[0]; $offset = $_[1]; # Need both _ and spaces for precise matching later $target =~ s/ /_/g; $targetSpace = $target; $targetSpace =~ s/_/ /g; unless ($target =~ m/^Category:/) { myLog ("getCategoryArticles(): Are you sure '$target' is a category?\n"); die ("902 getCategoryArticles(): Are you sure '$target' is a category?\n"); } if ($offset eq "") { $url = "http://en.wikipedia.org/wiki/${target}"; } else { $url = "http://en.wikipedia.org/w/index.php?title=${target}&from=${offset}"; } # Create a request-object if ($offset eq "") { myPrint("GET ${url}\n"); } myLog("GET ${url}\n"); $request = HTTP::Request->new(GET => "${url}"); $response = startRetry(\&httpRequest, $request); $reply = $response->content; # This detects whether or not we're logged in. checkLogin($reply); $articles = $reply; $articles =~ s%^.*?<h2>Articles in category.*?</h2>%%s; $articles =~ s%<div class="printfooter">.*?$%%s; @articles = $articles =~ m%<li><a href="/wiki/(.*?)" title=%sg; if ($reply =~ m/<a\s+href=\"\/w\/index.php\?title=${target}\&from=(.*?)\"\s+title=\"${targetSpace}\">next 200<\/a>/s) { sleep (1); # Throttle GETs @moreArticles = getCategoryArticles($target, $1); @articles = (@articles, @moreArticles); } $Pearle::ua->cookie_jar->save(); $numberOfArticles = @articles; if ($offset eq "") { myPrint("Got $numberOfArticles articles.\n"); myLog ("Got $numberOfArticles articles.\n"); } return decodeArray(@articles); } sub getCategoryImages { die "901 Pearle library not initialized!\n" if(!$Pearle::_inited); my ($target, $from, $request, $response, $reply, $images, @images, $image, %imagesHash); $target = $_[0]; $from = $_[1]; unless ($target =~ m/^Category:/) { myLog ("getCategoryImages(): Are you sure '$target' is a category?\n"); die ("902 getCategoryImages(): Are you sure '$target' is a category?\n"); } # Create a request-object if(!defined($from)) # Default: Start at the beginning of a category { myPrint("GET http://en.wikipedia.org/wiki/${target}\n"); myLog("GET http://en.wikipedia.org/wiki/${target}\n"); $request = HTTP::Request->new(GET => "http://en.wikipedia.org/wiki/${target}"); } else # Start somewhere middle-ish { myPrint("GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&from=$from\n"); myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&from=$from\n"); $request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${target}\&from=$from"); } $response = startRetry(\&httpRequest, $request); $reply = $response->content; # This detects whether or not we're logged in. checkLogin($reply); # unless ($reply =~ m%<a href="/wiki/User_talk:$Pearle::username">My talk</a>%) # { # # We've lost our identity. # myLog ("Wiki server is not recognizing me (2).\n---\n${reply}\n---\n"); # die ("Wiki server is not recognizing me (2).\n"); # } $images = $reply; $images =~ s/^.*?<table class="gallery"//s; $images =~ s/<div class="printfooter">.*?$//s; @images = $images =~ m/<a\s+href="\/wiki\/(.*?)"\s+title=\"Image:/g; @images = grep {$_ =~ /^Image:/} @images; if($images =~ /&from=([^"]+)" title="Category:[^"]*">next 200/) { print "More: $1\n"; @images = (@images, getCategoryImages($target, $1)); } # Uniqify to prevent duplicates @images = uniquify(@images); $Pearle::ua->cookie_jar->save(); return decodeArray(@images); } sub getSubcategories { die "901 Pearle library not initialized!\n" if(!$Pearle::_inited); my ($target, $request, $response, $reply, $subcats, $subcat, @subcats, $attemptStartTime, $attemptFinishTime); $target = $_[0]; unless ($target =~ m/^Category:/) { myLog ("getSubcategories(): Are you sure '$target' is a category?\n"); die ("902 getSubcategories(): Are you sure '$target' is a category?\n"); } # Create a request-object myPrint("GET http://en.wikipedia.org/wiki/${target}\n"); myLog("GET http://en.wikipedia.org/wiki/${target}\n"); $request = HTTP::Request->new(GET => "http://en.wikipedia.org/wiki/${target}"); $response = startRetry(\&httpRequest, $request); $reply = $response->content; # This detects whether or not we're logged in. checkLogin($reply); $subcats = $reply; if ($subcats =~ m%^.*?<h2>Subcategories</h2>(.*?)<h2>Pages in category.*?</h2>.*?$%s) { $subcats =~ s%^.*?<h2>Subcategories</h2>(.*?)<h2>Pages in category.*?</h2>.*?$%$1%s; } else { return (); } @subcats = $subcats =~ m%<li><a href="/wiki/(.*?)" title=%sg; $Pearle::ua->cookie_jar->save(); return decodeArray(@subcats); } # Get up to $max most recent articles edited by a user sub getUserArticles { die "901 Pearle library not initialized!\n" if(!$Pearle::_inited); my ($url, $request, $response, $reply, @contribs, $target, $namespace, $max, $offset); $target = $_[0]; $max = $_[1]; $offset = $_[2]; $namespace = namespaceToNumber($_[3]); # Create a request-object if(defined($namespace)) { $url = "http://en.wikipedia.org/w/index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}&namespace=$namespace"; } else { $url = "http://en.wikipedia.org/w/index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}"; } myPrint("GET $url\n"); myLog("GET $url\n"); $request = HTTP::Request->new(GET => "$url"); $response = startRetry(\&httpRequest, $request); $reply = $response->content; # This detects whether or not we're logged in. checkLogin($reply); # Extract the contributions # <li>23:18, 6 March 2006 (<a href="/w/index.php?title=User_talk:OrphanBot&action=history" title="User talk:OrphanBot"> while($reply =~ /<li>$Pearle::regex_timestamp_nc \($Pearle::regex_bluepagelink/g) { push @contribs, $1; } # Remove duplicates # @contribs = uniquify(@contribs); return @contribs; } # Gets a list of (articles, actor, summary) tuples from the specified log (upload, delete, move, protect) sub getLogArticles { die "901 Pearle library not initialized!\n" if(!$Pearle::_inited); my ($url, $request, $response, $reply, @articles, $log, $max, $offset, $user); $log = $_[0]; $max = $_[1] || 50; $offset = $_[2] || 0; $user = $_[3] || ''; # Create a request-object # http://en.wikipedia.org/w/index.php?title=Special:Log&type=upload&user=&page=&limit=2000&offset=0 $url = "http://en.wikipedia.org/w/index.php?title=Special%3ALog&limit=${max}&offset=${offset}&user=${user}&type=${log}"; myPrint("GET $url\n"); myLog("GET $url\n"); $request = HTTP::Request->new(GET => "$url"); $response = startRetry(\&httpRequest, $request); $reply = $response->content; # This detects whether or not we're logged in. checkLogin($reply); # Extract the articles #<li>19:55, 7 March 2006 <a href="/wiki/User:Jimbo_Wales" title="User:Jimbo Wales">Jimbo Wales</a> deleted "<a href="/w/index.php?title=Image:Justinsfriends.jpg&action=edit" class="new" title="Image:Justinsfriends.jpg">Image:Justinsfriends.jpg</a>" <span class='comment'>(blatant copyvio)</span> </li> #<li>19:54, 7 March 2006 <a href="/wiki/User:MrD9" title="User:MrD9">MrD9</a> moved <a href="/w/index.php?title=Statsoft&redirect=no" title="Statsoft">Statsoft</a> to <a href="/wiki/StatSoft" title="StatSoft">StatSoft</a> (<a href="/w/index.php?title=Special:Movepage&wpOldTitle=StatSoft&wpNewTitle=Statsoft&wpReason=revert&wpMovetalk=0" title="Special:Movepage">revert</a>)</li> #<li>19:53, 7 March 2006 <a href="/w/index.php?title=User:Biederman&action=edit" class="new" title="User:Biederman">Biederman</a> uploaded "<a href="/wiki/Image:Rockingham_Raymond_NH.PNG" title="Image:Rockingham Raymond NH.PNG">Image:Rockingham Raymond NH.PNG</a>" <span class='comment'>(Changed Image:Rockingham_Portsmouth_NH.PNG to highlight Raymond )</span> </li> #<li>19:31, 7 March 2006 <a href="/wiki/User:Francs2000" title="User:Francs2000">Francs2000</a> protected <a href="/wiki/Manoeuvre.org" title="Manoeuvre.org">Manoeuvre.org</a> <span class='comment'>({{deletedpage}} [edit=sysop:move=sysop])</span> </li> #<li>19:30, 7 March 2006 <a href="/wiki/User:Tony_Sidaway" title="User:Tony Sidaway">Tony Sidaway</a> unprotected <a href="/wiki/Will_McWhinney" title="Will McWhinney">Will McWhinney</a> <span class='comment'>(This looks like the protection that time forgot.)</span> </li> # while($reply =~ /<li>$Pearle::regex_timestamp_nc ${Pearle::regex_pagelink}.*?<\/a> (?:deleted|moved|uploaded|protected|unprotected) "?${Pearle::regex_pagelink}.*?<\/a>"(?:\s*<span class='comment'>(.*)<\/span>|)/g) while($reply =~ /<li>($Pearle::regex_timestamp_nc) ${Pearle::regex_pagelink}.*?<\/a> \(${Pearle::regex_pagelink_nc}Talk<\/a> \| ${Pearle::regex_pagelink_nc}contribs<\/a>\) (?:deleted|moved|uploaded|protected|unprotected) "?${Pearle::regex_pagelink}.*?<\/a>"(?:\s*<span class="comment">(.*)<\/span>|)/g) { my $summary = $3 || ''; push @articles, [$3, $2, $summary, $1]; } @articles = uniquify_ref1(@articles); return @articles; } # Use the Special:Export interface to get the wikitext of one or more articles sub Export { my ($request, $response, $reply, $articles); $articles = join "\n", @_; $request = POST "http://en.wikipedia.org/w/index.php?title=Special:Export&action=submit", [action => 'submit', pages => $articles, curonly => 1]; $response = startRetry(\&httpRequest, $request); $reply = $response->content; return $reply; } # Do a null edit to an article sub nullEdit { die "901 Pearle library not initialized!\n" if(!$Pearle::_inited); my ($text, $articleName, $comment, $editTime, $startTime, $token); $articleName = $_[0]; myPrint("nullEdit($articleName)\n"); myLog ("nullEdit($articleName)\n"); ($text, $editTime, $startTime, $token) = getPage($articleName); unless ($text eq "") { postPage ($articleName, $editTime, $startTime, $token, $text, "null edit"); } } # Get the history of an article and parse the first 500 entries into a list of [link day month year] lists sub parseHistory { my ($pageName, $html, @lines, $line, $date, $hour, $minute, $day, $month, $year, $htmlCopy, $link, $user, @result); $pageName = $_[0]; $pageName = escapeUrl($pageName); $html = getURL("http://en.wikipedia.org/w/index.php?title=${pageName}&action=history&limit=500"); $htmlCopy = $html; $html =~ s%^.*?<ul id="pagehistory">%%s; $html =~ s%(.*?)</ul>.*$%$1%s; @lines = split ("</li>", $html); foreach $line (@lines) { $line =~ s/\n/ /g; if ($line =~ m/^\s*$/) { next; } ($user) = $line =~ /<span class='history-user'><a href=[^>]*>([^<]*)/; $line =~ s/<span class='history-user'>.*?$//; $line =~ s/^.*?Select a newer version for comparison//; $line =~ s/^.*?Select a older version for comparison//; $line =~ s/^.*?name="diff" \/>//; $line =~ m%<a href="(.*?)" title="(.*?)">$Pearle::regex_timestamp</a>%; $link = $1; $hour = $3; $minute = $4; $day = $5; $month = $6; $year = $7; push @result, [$link, $day, $month, $year, $user]; } return (@result); } sub getURL #($target) { die "901 Pearle library not initialized!\n" if(!$Pearle::_inited); # Read throttle! sleep (1); my ($request, $response, $reply, $url); $url = $_[0]; # Create a request-object myPrint("GET ${url}\n"); myLog("GET ${url}\n"); $request = HTTP::Request->new(GET => "${url}"); $response = startRetry(\&httpRequest, $request); $reply = $response->content; # This may or may not actually work $Pearle::ua->cookie_jar->save(); return ($reply); } # Retries a given function repeatedly, with an exponential backoff rate # The function should throw an exception beginning with "retry:" (case insensitive) if the call should be retried sub startRetry { my ($call_fn, @args) = @_; return retry($Pearle::speedLimit, $call_fn, @args); } sub retry { my ($call_fn, @args, $delay, @result, $result); ($delay, $call_fn, @args) = @_; if(wantarray()) { @result = eval{ $call_fn->(@args) }; if($@ =~ /^retry:/i) { limit($delay); @result = retry($delay * 2, $call_fn, @args); } elsif($@) { die; } return @result; } else { $result = eval{ &{$call_fn}(@args) }; if($@ =~ /^retry:/i) { limit($delay); $result = retry($delay * 2, $call_fn, @args); } elsif($@) { die; } return $result; } } sub namespaceToNumber { my $namespace = $_[0]; my $i = 0; my $name; if(defined($namespace)) { foreach $name (@Pearle::namespaces) { return $i if(lc($name) eq lc($namespace)); $i++; } } else { return undef; } } sub numberToNamespace { my $i = shift; if(defined($i)) { return $Pearle::namespaces[$i]; } else { return undef; } } # Translate from HTTP URL encoding to the native character set. sub urlDecode { my ($input); $input = $_[0]; $input =~ s/\%([a-f|A-F|0-9][a-f|A-F|0-9])/chr(hex($1))/eg; return ($input); } # Basic escaping of special characters in a URL sub escapeUrl { my $input = shift; $input =~ s/%/%25/g; $input =~ s/&/%26/g; $input = unicodeToUrl($input); return $input; } # URL-escape any high-unicode chars in a string sub unicodeToUrl { my ($char, $input, $output);; $input = $_[0]; foreach $char (split("",$input)) { if(ord($char) > 255) { $output .= uc(sprintf("%%%x%%%x", int(ord($char)/256), ord($char) & 0xFF)); # %HH%LL where HHLL is the hex code of $char } else { $output .= $char; } } return $output; } # Translate from the native character set to the Wikipedia HTTP URL encoding. sub urlEncode { my ($char, $input, $output); $input = $_[0]; foreach $char (split("",$input)) { # if ($char =~ m/[a-z|A-Z|0-9|\-_\.\!\~\*\'\(\)]/) # The below exclusions should conform to Wikipedia practice # (possibly non-standard) if ($char =~ m/[a-z|A-Z|0-9|\-_\.\/:]/) { $output .= $char; } elsif ($char eq " ") { $output .= "_"; } else { if(ord($char) > 255) { $output .= uc(sprintf("%%%x%%%x", int(ord($char)/256), ord($char) & 0xFF)); # %HH%LL where HHLL is the hex code of $char } else { $output .= uc(sprintf("%%%x", ord($char))); # %HH where HH is the hex code of $char } } } return ($output); } sub decodeArray { return map {urlDecode($_)} @_; } # Remove duplicates from a list sub uniquify { my @list = @_; @list = sort @list; my $last = undef; my @new_list; my $item; foreach $item (@list) { push @new_list, $item if(!defined($last) or ($item ne $last)); $last = $item; } return @new_list; } # Remove duplicates from a list of array references, grouping on the first subelement sub uniquify_ref1 { my @list = @_; @list = sort {$a->[0] cmp $b->[0]} @list; my $last = undef; my @new_list; my $item; foreach $item (@list) { push @new_list, $item if(!defined($last) or ($item->[0] ne $last)); $last = $item->[0]; } return @new_list; } 1;