User:Lar/ClassificationTableGen
Old version at: User:Lar/ClassificationTableGen/Backlev
Perl code: This code generated User:Lar/Sandbox2 (version 8). There is a lot of work to do on it yet, but if you stumble across this, feedback is welcome. Not ready for public release yet (if ever).
Signature shows update time/date ++Lar: t/c 03:34, 27 April 2006 (UTC)
note: requires Perl 5.8; only tested on a Wintel platform.
Helper module, based on Pearle Wisebot code
[edit]... this module needs better commenting and trimming out of code not needed for what I need. It ALSO needs the user/cookie/token handling made generic. It is hard-coded to use my userid, so it may not work for anyone else (no, you can't have my cookie file). Read the User:Pearle pages on how to set up your own, then look for Lar in there and change it to your userid.
Filename WP_util_pearlebot.pm
package WP_util_pearlebot;
# assumes WP_util_pearlebot.pm
# based on boilerplate module declaration found here:
# http://perldoc.perl.org/perlmod.html#Perl-Modules-module
#
# based on code that is part of the "Pearle Wisebot"
# http://en.wikipedia.org/wiki/User:Pearle
# http://en.wikipedia.org/wiki/User:Pearle/pearle-documentation.txt
# http://en.wikipedia.org/wiki/User:Pearle/pearle.pl
# which was created by [[User:Beland]]:
# http://en.wikipedia.org/wiki/User:Beland
# Mods by Larry Pieniazek ( [[user:Lar]] )
#
# OVERVIEW (added by review):
# Minimal screen-scraping helper for editing the English Wikipedia via the
# pre-API edit form (wiki.phtml?action=edit / action=submit).  Exposes
# getPage/postPage plus a retry/backoff helper.  State is kept in package
# globals: $::ua (the LWP user agent), $::speedLimit (minimum seconds
# between edits), $::retryDelay (current backoff), $::nullOK (whether an
# empty page body is acceptable).
# NOTE(review): the login check and cookie file below are hard-coded to
# user "Lar"; this module will die for any other logged-in user.

use strict;
use warnings;
use Data::Dumper;
use Getopt::Std;
use Time::HiRes;
use utf8;
#use encoding 'utf8';

# Initialization
use LWP;
use HTTP::Cookies;
use HTTP::Request::Common qw(POST);
use HTML::Entities;

BEGIN {
    use Exporter ();
    our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);

    # set the version for version checking
    $VERSION = 1.00;
    # if using RCS/CVS, this may be preferred
    # $VERSION = sprintf "%d.%03d", q$Revision: 1.1 $ =~ /(\d+)/g;

    @ISA = qw(Exporter);
    # NOTE(review): func1/func2/func4 are leftover boilerplate names; the
    # subs actually defined in this file (getPage, postPage, ...) are not
    # exported and are called fully qualified by the driver script.
    @EXPORT = qw(&func1 &func2 &func4);
    %EXPORT_TAGS = ( );    # eg: TAG => [ qw!name1 name2! ],

    # your exported package globals go here,
    # as well as any optionally exported functions
    # @EXPORT_OK = qw($Var1 %Hashit &func3);
    # we do not have any externals
}
our @EXPORT_OK;

# exported package globals go here
# our $Var1;
# our %Hashit;

# non-exported package globals go here
# initialize package globals, first exported ones
# then the others (which are still accessible as $WP_util_pearlebot::stuff)

# all file-scoped lexicals must be created before
# the functions below that use them.
# file-private lexicals go here

# here's a file-private function as a closure,
# callable as &$priv_func; it cannot be prototyped.
# my $priv_func = sub {
#     stuff goes here.
# };

# make all your functions, whether exported or not;
# remember to put something interesting in the {} stubs
sub myLog;
sub getPage;
sub postPage;
sub retry;
sub printWikitext;
sub test;

END { }    # module clean-up code here (global destructor)

## YOUR CODE GOES HERE

# LWP:UserAgent is a library which allows us to create a "user agent"
# object that handles the low-level details of making HTTP requests.
# Module-load side effect: the UA and its cookie jar are built as soon as
# this module is use'd.  The cookie jar file must already hold a valid
# login session (see the User:Pearle docs).
$::ua = LWP::UserAgent->new(timeout => 300);
$::ua->agent("LarUtil/0.1");
$::ua->cookie_jar(HTTP::Cookies->new(file => "cookies.lar.txt", autosave => 1));
# $::ua->cookie_jar->load();
# $::ua->
#
# $ua = LWP::UserAgent->new;
# $req = HTTP::Request->new(GET => 'http://www.linpro.no/secret/');
# $req->authorization_basic('aas', 'mypassword');
# print $ua->request($req)->as_string;

# Hot pipes
$| = 1;

#set default speedlimit
$::speedLimit = 10;

##---
## test();

# test()
# Smoke test: fetches Wikipedia:Sandbox, appends a marker line, posts it
# back as a non-minor edit, then dies.  Not called automatically (see the
# commented-out call above).  Sets $::nullOK so an empty sandbox is OK.
sub test {
    my ($target, $text, $editTime, $startTime, $token);
    #$target = "Special:Userlogin";
    #($text, $editTime, $startTime, $token) = getPage($target);
    # temporary
    $::nullOK = "yes";
    $target = "Wikipedia:Sandbox";
    ($text, $editTime, $startTime, $token) = getPage($target);
    print $text;
    # die ("nopost Test complete.");
    $text .= "\nEat my electrons! -- testing Pearle clone ([[User:Lar]]) \n";
    print "---\n";
    # "najor" is deliberately not "yes", so the edit is NOT marked minor.
    postPage ($target, $editTime, $startTime, $token, $text, "Test 028", "najor");   # (no it is not minor)
    die ("Test complete.");
}

##---

# getPage($target)
# Fetches the edit form for page $target and scrapes out of the HTML:
#   - the raw wikitext ($text, entity-decoded),
#   - wpEdittime ($editTime), wpStarttime ($startTime), wpEditToken ($token)
# Returns ($text, $editTime, $startTime, $token).
# Dies on: empty target, lost login session, empty text/time (unless
# $::nullOK eq "yes"), or text still containing < or > (bad scrape).
# Retries via retry() on HTTP 500/502/503; dies on other HTTP errors.
# NOTE(review): the textarea/hidden-field regexes below match the 2006
# MediaWiki edit-form markup exactly and will silently yield undef on any
# other markup version — TODO confirm against the target wiki.
sub getPage {
    my ($target, $request, $response, $reply, $text, $text2, $editTime, $startTime, $attemptStartTime, $attemptFinishTime, $token);
    $target = $_[0];
    if ($target =~ m/^\s*$/) {
        myLog("getPage: Null target.");
        die("getPage: Null target.");
    }
    # urlSafe ($target);

    # Monitor wiki server responsiveness
    $attemptStartTime = Time::HiRes::time();

    # Create a request-object
    print "GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n";
    myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n");
    $request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit");
    $response = $::ua->request($request);
    if ($response->is_success) {
        $reply = $response->content;

        # Monitor wiki server responsiveness
        $attemptFinishTime = Time::HiRes::time();
        retry ("success", "getPage", sprintf("%.3f", $attemptFinishTime-$attemptStartTime));

        # This detects whether or not we're logged in.
        # NOTE(review): hard-coded to user Lar's talk-page link.
        unless ($reply =~ m%<a href="/wiki/User_talk:Lar">My talk</a>%) {
            # We've lost our identity.
            myLog ("Wiki server is not recognizing me (1).\n---\n${reply}\n---\n");
            die ("Wiki server is not recognizing me (1).\n");
        }
        # Dump the whole server reply to the log before scraping it (very
        # verbose; useful when the regexes below stop matching).
        my $saveReply=$reply;
        myLog ("Dump reply prior to regex processing in getPage... \n---\n${saveReply}\n---\n");

        # Scrape the article wikitext out of the edit-form textarea.
        $reply =~ m%<textarea\s+tabindex='1'\s+accesskey=","\s+name="wpTextbox1"\s+id="wpTextbox1"\s+rows='25'\s+cols='80'\s+>(.*?)</textarea>%s;
        $text = $1;
        # $reply =~ m%<textarea\s+tabindex='1'\s+accesskey=","\s+name="wpTextbox1"\s+id="wpTextbox1"\s+rows='25'\s+cols='80'\s+>(.*?)</textarea>%s;
        # $reply =~ m%<textarea\s*tabindex='1'\s*accesskey=","\s*name="wpTextbox1"\s*rows='25'\s*cols='80'\s*>(.*?)</textarea>%s;
        # $text = $1;
        # print "debug: 1: ".$1."\n";

        # Hidden form fields needed to submit an edit without conflict.
        $reply =~ m/value="(\d+)" name="wpEdittime"/;
        $editTime = $1;
        # Added 22 Aug 2005 to correctly handle articles that have
        # been undeleted
        $reply =~ m/value="(\d+)" name="wpStarttime"/;
        $startTime = $1;
        # Added 9 Mar 2005 after recent software change.
        $reply =~ m/value="(\w+)" name="wpEditToken"/;
        $token = $1;
        ###

        # Sanity checks on the scrape; $::nullOK relaxes the empty cases.
        if (($text =~ m/^\s*$/) and ($::nullOK ne "yes")) {
            myLog ("getPage($target): Null text!\n");
            myLog "\n---\n$reply\n---\n";
            die ("getPage($target): Null text!\n");
        }
        if (($editTime =~ m/^\s*$/) and ($::nullOK ne "yes")) {
            myLog ("getPage($target): Null time!\n");
            myLog "\n---\n$reply\n---\n";
            die ("getPage($target): Null time!\n");
        }
        # Angle brackets in $text mean the textarea regex grabbed HTML
        # rather than pure wikitext.
        if (($text =~ m/>/) or ($text =~ m/</)) {
            print $text;
            myLog "\n---\n$text\n---\n";
            myLog ("getPage($target): Bad text suck!\n");
            die ("getPage($target): Bad text suck!\n");
        }

        # Change ( &quot; -> " ) etc
        # This function is from HTML::Entities.
        decode_entities($text);

        # This may or may not actually work
        $::ua->cookie_jar->save();
        return ($text, $editTime, $startTime, $token);
    } else {
        myLog ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n".$response->content."\n");
        print ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n".$response->content."\n");
        # 50X HTTP errors mean there is a problem connecting to the wiki server
        if (($response->status_line =~ m/^500/) or ($response->status_line =~ m/^502/) or ($response->status_line =~ m/^503/)) {
            return(retry("getPage", @_));
        } else {
            # Unhandled HTTP response
            die ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n");
        }
    }
}

# postPage($pageName, $editTime, $startTime, $editToken, $textToPost,
#          $summaryEntry, $minor)
# Submits $textToPost to page $pageName via the edit-form POST.  $editTime,
# $startTime and $editToken must come from a prior getPage() of the same
# page.  $summaryEntry is truncated to 200 chars and defaults to
# "Automated editing." when empty.  The edit is marked minor only when
# $minor eq "yes".  Returns the response body on success.  Dies on:
# deleted-article recreation prompt, unexpected status/Location, or an
# edit-conflict page.  Retries via retry() on HTTP 500/502/503.
sub postPage {
    my ($request, $response, $pageName, $textToPost, $summaryEntry, $editTime, $startTime, $actual, $expected, $attemptStartTime, $attemptFinishTime, $date, $editToken, $minor);
    $pageName = $_[0];
    $editTime = $_[1];
    $startTime = $_[2];
    $editToken = $_[3];
    $textToPost = $_[4];
    $summaryEntry = $_[5];   # Max 200 chars!
    $minor = $_[6];
    $summaryEntry = substr($summaryEntry, 0, 200);
    if ($pageName eq "") {
        myLog ("postPage(): Empty pageName.\n");
        die ("postPage(): Empty pageName.\n");
    }
    if ($summaryEntry eq "") {
        $summaryEntry = "Automated editing.";
    }

    # Monitor server responsiveness
    $attemptStartTime = Time::HiRes::time();

    # Build the form POST; the two branches differ only in wpMinoredit.
    if ($minor eq "yes") {
        $request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageName}&action=submit",
            [wpTextbox1 => $textToPost,
             wpSummary => $summaryEntry,
             wpSave => "Save page",
             wpMinoredit => "on",
             wpEditToken => $editToken,
             wpStarttime => $startTime,
             wpEdittime => $editTime];
        # Optional: wpWatchthis
    } else {
        $request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageName}&action=submit",
            [wpTextbox1 => $textToPost,
             wpSummary => $summaryEntry,
             wpSave => "Save page",
             wpEditToken => $editToken,
             wpStarttime => $startTime,
             wpEdittime => $editTime];
        # Optional: wpWatchthis, wpMinoredit
    }
    # ---
    ## If posts are failing, you can uncomment the below to see what
    ## HTTP request is being made.
    # myLog($request->as_string());
    # print $request->as_string();
    # NOTE(review): this live assignment sits inside a commented-out debug
    # section and forces the 10-minute speed limit on EVERY post; it looks
    # like leftover debugging rather than intent — TODO confirm.
    $::speedLimit = 60 * 10;
    # print $::ua->request($request)->as_string;
    # ---

    myLog("POSTing...");
    print "POSTing...";
    # Pass request to the user agent and get a response back
    $response = $::ua->request($request);
    myLog("POSTed.\n");
    print "POSTed.\n";

    # NOTE(review): pattern appears to be missing the word "you"
    # ("...that you really want...") relative to the MediaWiki prompt it
    # targets — TODO confirm against server output before relying on it.
    if ($response->content =~ m/Please confirm that really want to recreate this article./) {
        myLog ($response->content."\n");
        die ("Deleted article conflict! See log!");
    }

    # Check the outcome of the response
    if (($response->is_success) or ($response->is_redirect)) {
        # Monitor server responsiveness
        $attemptFinishTime = Time::HiRes::time();
        retry ("success", "postPage", sprintf("%.3f", $attemptFinishTime-$attemptStartTime));

        # A successful save normally answers "302 Moved Temporarily";
        # "200 OK" is also tolerated.
        $expected = "302 Moved Temporarily";
        $actual = $response->status_line;
        if (($expected ne $actual) and ($actual ne "200 OK")) {
            myLog ("postPage(${pageName}, $editTime)#1 - expected =! actual\n");
            myLog ($request->as_string());
            myLog ("EXPECTED: '${expected}'\n");
            myLog (" ACTUAL: '${actual}'\n");
            die ("postPage(${pageName}, $editTime)#1 - expected =! actual - see log\n");
        }

        # The redirect Location should point back at the saved page.
        $expected = "http://en.wikipedia.org/wiki/${pageName}";
        $expected =~ s/\'/%27/g;
        $expected =~ s/\*/%2A/g;
        # $expected = urlEncode($expected);
        $actual = $response->headers->header("Location");
        if (($expected ne $actual) and !(($actual eq "") and ($response->status_line eq "200 OK"))) {
            myLog ("postPage(${pageName}, $editTime)#2 - expected =! actual\n");
            myLog ("EXPECTED: '${expected}'\n");
            myLog (" ACTUAL: '${actual}'\n");
            die ("postPage(${pageName}, $editTime)#2 - expected =! actual - see log\n");
        }
        if ($response->content =~ m/<h1 class="firstHeading">Edit conflict/) {
            myLog ("Edit conflict on '$pageName' at '$editTime'!\n");
            die ("Edit conflict on '$pageName' at '$editTime'!\n");
        }
        $::ua->cookie_jar->save();
        return ($response->content);
    } else {
        # `date` shells out just to timestamp the log entry.
        $date = `date`;
        $date =~ s/\n//g;
        myLog ("Bad response to POST to $pageName at $date.\n".$response->status_line."\n".$response->content."\n");
        # 50X HTTP errors mean there is a problem connecting to the wiki server
        if (($response->status_line =~ m/^500/) or ($response->status_line =~ m/^502/) or ($response->status_line =~ m/^503/)) {
            print "Bad response to POST to $pageName at $date.\n".$response->status_line."\n".$response->content."\n";
            return(retry("postPage", @_));
        } else {
            # Unhandled HTTP response
            die ("Bad response to POST to $pageName at $date.\n".$response->status_line."\n");
        }
    }
}

# myLog($message)
# Appends $message verbatim to pearle-wisebot.ersatz.log.txt, opening and
# closing the file on every call.  Dies if the file cannot be opened.
# NOTE(review): 2-arg open on a bareword handle; a 3-arg lexical open
# would be preferred, left unchanged here.
sub myLog {
    open (LOG, ">>pearle-wisebot.ersatz.log.txt") || die "Could not append to log!";
    print LOG $_[0];
    close (LOG);
}

# A call to this recursive function handles any retries necessary to
# wait out network or server problems. It's a bit of a hack.
# retry() — backoff/retry engine shared by the page-fetch/post routines.
# A call to this recursive function handles any retries necessary to
# wait out network or server problems. It's a bit of a hack.
#
# Two calling conventions:
#   retry("success", $opName, $responseTimeSec)
#       Records the response time, resets $::retryDelay, and (for
#       postPage) adjusts the global $::speedLimit to back off when the
#       server is slow.  Returns nothing.
#   retry($opName, @originalArgs)
#       Escalates $::retryDelay (1 min -> 10 min -> 1 hr), sleeps with a
#       countdown on STDERR, then re-invokes $opName(@originalArgs) and
#       returns its result.  Dies on an unknown $opName or an
#       unrecognized $::retryDelay value.
sub retry {
    my ($callType, @args) = @_;

    ### ATTENTION ###
    # Increasing the speed of the bot to faster than 1 edit every 10
    # seconds violates English Wikipedia rules as of April, 2005, and
    # will cause your bot to be banned. So don't change $normalDelay
    # unless you know what you are doing. Other sites may have
    # similar policies, and you are advised to check before using your
    # bot at the default speed.
    #################

    # HTTP failures are usually an indication of high server load.
    # The retry settings here are designed to give human editors
    # priority use of the server, by allowing it ample recovering time
    # when load is high.

    # Time to wait before retry on failure, in seconds
    my $normalDelay = 10;       # Normal interval between edits is 10 seconds
    my $firstRetry  = 60;       # First delay on fail is 1 minute
    my $secondRetry = 60 * 10;  # Second delay on fail is 10 minutes
    my $thirdRetry  = 60 * 60;  # Third delay on fail is 1 hour

    # FIX: $::retryDelay used to be compared while still undef on the very
    # first failure, emitting "uninitialized value" warnings and relying on
    # undef numifying to 0.  Make the first-failure state explicit.
    $::retryDelay = 0 unless defined $::retryDelay;

    # SUCCESS CASE
    # e.g. retry ("success", "getPage", "0.23");
    if ($callType eq "success") {
        myLog("Response time for ".$args[0]." (sec): ".$args[1]."\n");
        $::retryDelay = $normalDelay;
        if ($args[0] eq "postPage") {
            # If the response time is greater than 20 seconds...
            if ($args[1] > 20) {
                print "Wikipedia is very slow. Increasing minimum wait to 10 min...\n";
                myLog("Wikipedia is very slow. Increasing minimum wait to 10 min...\n");
                $::speedLimit = 60 * 10;
            }
            # If the response time is between 10 and 20 seconds...
            elsif ($args[1] > 10) {
                print "Wikipedia is somewhat slow. Setting minimum wait to 60 sec...\n";
                myLog("Wikipedia is somewhat slow. Setting minimum wait to 60 sec...\n");
                $::speedLimit = 60;
            }
            # If the response time is less than 10 seconds...
            else {
                if ($::speedLimit > 10) {
                    print "Returning to normal minimum wait time.\n";
                    myLog("Returning to normal minimum wait time.\n");
                    $::speedLimit = 10;
                }
            }
        }
        return();
    }
    # FAILURE CASES — escalate the delay based on where we are now.
    # e.g. retry ("getPage", "George_Washington")
    elsif (($::retryDelay == $normalDelay) or ($::retryDelay == 0)) {
        print "First retry for ".$args[0]."\n";
        myLog("First retry for ".$args[0]."\n");
        $::retryDelay = $firstRetry;
        $::speedLimit = 60 * 10;
    } elsif ($::retryDelay == $firstRetry) {
        print "Second retry for ".$args[0]."\n";
        myLog("Second retry for ".$args[0]."\n");
        $::retryDelay = $secondRetry;
        $::speedLimit = 60 * 10;
    } elsif ($::retryDelay == $secondRetry) {
        print "Third retry for ".$args[0]."\n";
        myLog("Third retry for ".$args[0]."\n");
        $::retryDelay = $thirdRetry;
        $::speedLimit = 60 * 10;
    } elsif ($::retryDelay == $thirdRetry) {
        # Cap at one hour; keep retrying at that interval forever.
        print "Nth retry for ".$args[0]."\n";
        myLog("Nth retry for ".$args[0]."\n");
        $::retryDelay = $thirdRetry;
        $::speedLimit = 60 * 10;
    } else {
        die ("retry(): Internal error - unknown delay factor '".$::retryDelay."'\n");
    }

    # DEFAULT TO FAILURE CASE HANDLING
    # Visible countdown so an operator knows the bot is waiting, not hung.
    my $i = $::retryDelay;
    while ($i >= 0) {
        sleep (1);
        print STDERR "Waiting $i seconds for retry...\r";
        $i--;
    }
    print "                                             \r";

    # DO THE ACTUAL RETRY
    # NOTE(review): getCategoryArticles/getSubcategories/getURL are not
    # defined in this module; presumably inherited from the original
    # Pearle code — confirm before dispatching those call types.
    if ($callType eq "getPage") {
        return(getPage(@args));
    } elsif ($callType eq "postPage") {
        return(postPage(@args));
    } elsif ($callType eq "getCategoryArticles") {
        return(getCategoryArticles(@args));
    } elsif ($callType eq "getSubcategories") {
        return(getSubcategories(@args));
    } elsif ($callType eq "getURL") {
        return(getURL(@args));
    } else {
        myLog ("retry(): Unknown callType: $callType\n");
        die ("retry(): Unknown callType: $callType\n");
    }
}

# printWikitext($target)
# perl pearle.pl PRINT_WIKITEXT Article_you_want_to_get
## Warning: Saves to a file in the current directory with the same name
## as the article, plus another file with the .html extention.
sub printWikitext {
    my ($editTime, $startTime, $text, $target, $token);
    $target = $_[0];
    # Strip a leading "[[:" and trailing "]]" so a pasted wiki link works.
    $target =~ s/^\[\[://;
    $target =~ s/\]\]$//;
    ($text, $editTime, $startTime, $token) = getPage($target);

    # Save the wikicode version to a file.
    # FIX: was an unchecked 2-arg bareword open; a $target beginning with
    # ">" or containing "|" could change the open mode, and any failure
    # was silently ignored.  Use a checked 3-arg open on a lexical handle.
    open (my $wikitext_fh, '>', "./${target}")
        or die "printWikitext: could not write ./${target}: $!";
    print {$wikitext_fh} $text;
    close ($wikitext_fh)
        or die "printWikitext: error closing ./${target}: $!";

    # Save the HTML version to a file.
    # NOTE(review): $target is interpolated into a shell command line, so
    # a title containing shell metacharacters could inject commands.
    # Left as-is to preserve behavior (wget's output is printed); only
    # pass sanitized titles.
    print `wget http://en.wikipedia.org/wiki/${target} -O ./${target}.html`;
}

1;   # don't forget to return a true value from the file
Main code
[edit]Invoke as (for example)
perl genClassTable.pl -d 2 -C Wikipedia:WikiProject_The_Beatles/Categories -a leaveOrdered.txt -q C:\shortprg\AWB\enwiki-20060303-categorylinks.sql -o bigone2c.txt >runlog2c.txt
Filename: genClassTable.pl
#!/usr/bin/perl -w #---------------------------------------------------------------------------# # process files and generate a category table # Author: Larry Pieniazek (IBM/Ascential Software) as hobby project # Adapted from stuff I cribbed from all over. # (c)Larry Pieniazek 2006. This library is free software; you can redistribute it # and/or modify it under the same terms as Perl itself. # additionally, can be redistributed and modified under GFDL or CC-SA as you choose # # Abstract: # This perlscript is designed to parse category SQL dumps from wikipedia # which are found here: http://download.wikimedia.org/enwiki/ # For example the 23 March dump is called # http://download.wikimedia.org/enwiki/20060323/enwiki-20060323-categorylinks.sql.gz # # The parsing is to generate article classification tables such as those found at # http://en.wikipedia.org/wiki/Wikipedia_talk:WikiProject_The_Beatles/Article_Classification # # In addition to the dump (currently must have been converted to linefeed delimited tuples) # the other input is a list of categories of interest, one per line. 
#
#---------------------------------------------------------------------------#

use strict;
use WP_util_pearlebot;
use WP_util_ClassTable;
use Getopt::Std;

#---------------------------------------------------------------------------#
# Main routine -
#   process options
#   read in categories desired
#   build hash of articles by parsing SQL file
#   write out table file using hash
#---------------------------------------------------------------------------#

# main

my $status = 0;

# Parse the command line into the module's shared %options hash; any bad
# flag shows usage.  (d is the debug flag.)
getopts('hvd:q:a:C:c:o:', \%WP_util_ClassTable::options)
    or WP_util_ClassTable::Usage();
WP_util_ClassTable::ProcessOptions();

print "post process, pre read cat\n" if $WP_util_ClassTable::debug > 1;

# Article special-key list: load it when -a was given, otherwise start
# with an empty hash so later lookups are well-defined.
if (defined $WP_util_ClassTable::options{'a'}) {
    WP_util_ClassTable::ReadArtKeyFile();
}
else {
    %WP_util_ClassTable::artSpecialKeyHash = ();
}

# Category list: -C means fetch the list from a wiki page, otherwise read
# it from a local file.
if (defined $WP_util_ClassTable::options{'C'}) {
    $status = WP_util_ClassTable::FetchCatPage($WP_util_ClassTable::catArtPage);
    die "error fetching category list from Wikipedia\n" if $status;
}
else {
    $status = WP_util_ClassTable::ReadCatFile();
    die "error reading category list\n" if $status;
}

# Build the article hash from the categorylinks SQL dump, then emit the
# classification table.
$status = WP_util_ClassTable::ParseSQL();
die "error reading SQL or building structure\n" if $status;

$status = WP_util_ClassTable::WriteTable();
die "error building table\n" if $status;

exit 0;