User:ImageRemovalBot/removebot.pl
Appearance
ImageRemovalBot's code. Requires User:FairuseBot/Pearle.pm, User:FairuseBot/Pearle/WikiPage.pm, and User:FairuseBot/libBot.pm. User:ImageRemovalBot/removebot-followup.pl is used to follow up on failed removals.
#!/usr/bin/perl
# RemoveBot
#
# A bot to remove deleted images from pages
use strict;
use warnings;
use lib 'Insert the directory containing libBot.pm and Pearle.pm here';
use feature 'unicode_strings';
use Date::Calc qw(Gmtime Add_Delta_Days);
use Fcntl qw(:flock);
use libBot;
# ---------------------------------------------------------------------------
# Run-time settings
# ---------------------------------------------------------------------------

# When true, the deletion log is not consulted: a single hard-coded file is
# processed instead and the checkpoint file is not rewritten.
my $test = 0;

# Directory used for the bot's log, cookie, pid and checkpoint files.
my $homedir = 'Insert the directory contining the bot\'s script here';

# Allow talkpage messages to stop the bot?
my $permit_interruptions = 0;

# ---------------------------------------------------------------------------
# Library initialisation and login
# ---------------------------------------------------------------------------

# Arguments are presumably (bot name, password, log file, cookie file) --
# confirm against Pearle.pm.
Pearle::init(
    "Insert your bot name here",
    "Insert your bot password here",
    "$homedir/removebot.log",
    "$homedir/cookies.txt",
);
Pearle::config(nullOK => 1, printlevel => 4, loglevel => 2);

# config() is not defined in this file; presumably exported by libBot.
config(username => "Insert your bot name here");

# Without a successful login there is nothing to do, so stop right away.
exit unless Pearle::login();
# Check for a running copy.
#
# "$homedir/pid" holds the process ID written by the most recent run.  If a
# process with that ID is still alive and its `ps` line mentions removebot.pl,
# a warning is logged and this run exits.  Otherwise the pid file is
# overwritten with the current process ID.
if(-e "$homedir/pid")
{
    # Possible other copy. Compare PIDs
    my $pid;
    if(open my $pid_in, "<", "$homedir/pid")
    {
        $pid = <$pid_in>;
        close $pid_in;
    }

    # This script only ever writes a bare decimal PID.  Anything else (or a
    # file that could not be opened) is treated as unreadable instead of being
    # interpolated into the shell command below.
    my ($running_pid) = defined($pid) ? $pid =~ /\A\s*(\d+)\s*\z/ : ();

    if(defined($running_pid))
    {
        # Backticks yield undef when the command cannot be started at all.
        my $psresult = `ps -p $running_pid` // '';
        if($psresult =~ /removebot\.pl/)
        {
            botwarnlog("\n*Previous run is taking longer than normal");
            exit;
        }
    }
    else
    {
        Pearle::myLog(1, "Unable to read pidfile, assuming no other copy is running\n");
    }
}

# Record our own PID for the next run's check.  Failure to write is logged but
# is not fatal: the bot can still do its work without the guard.
if(open my $pid_out, ">", "$homedir/pid")
{
    print {$pid_out} $$;
    close $pid_out;
}
else
{
    Pearle::myLog(1, "Unable to write pidfile: $!\n");
}
# Get the last log entry processed.
#
# $last_date ends up as one of:
#   * the first line of "$homedir/lastfile.log" (newline stripped);
#   * "now minus two days" (UTC) if that file exists but yields no usable
#     line (unopenable, empty, or containing only a newline);
#   * the fixed 1970 timestamp if the file does not exist at all.
my ($last_date);
if(-e "$homedir/lastfile.log")
{
    if(open my $last_in, "<", "$homedir/lastfile.log")
    {
        $last_date = <$last_in>;
        close $last_in;
    }

    # Strip the newline *before* testing for emptiness, so that a file holding
    # just "\n" falls through to the fallback instead of producing "".
    chomp $last_date if defined($last_date);

    if(!defined($last_date) or length($last_date) == 0)
    {
        # Date::Calc's Gmtime() returns (year, month, day, hour, min, sec, ...);
        # only the first six values are needed here.
        my ($y, $m, $d, $h, $min, $s) = Gmtime();
        ($y, $m, $d) = Add_Delta_Days($y, $m, $d, -2);
        $last_date = sprintf("%02d-%02d-%02dT%02d:%02d:%02dZ", $y, $m, $d, $h, $min, $s);
        Pearle::myLog(1, "Unable to read lastfile.log, using $last_date instead\n");
    }
}
else
{
    $last_date = "1970-01-01T00:00:01Z"; # Beginning of time
}
# Running totals.  $total_processed and $i are only consumed by the
# commented-out multi-pass loop and summary print, which are kept for reference.
my $total_images = 0;
my $total_processed = 0;
my $i = 1;

# One pass over the deletion log:
#   1. fetch deletion-log entries newer than $last_date and keep the file
#      titles whose log comment does not look like an F8 (moved to Commons)
#      deletion;
#   2. for each file (and each redirect to it), remove its uses from the pages
#      that still reference it;
#   3. re-query usage afterwards and queue anything left over in followup.log;
#   4. write the new checkpoint to "$homedir/lastfile.log".
#for($i = 1; $i <= 100; $i++)
{
    my @images;
    my $images_removed = 0;
    Pearle::myLog(2, "Beginning set at " . time() . "\n");

    # Get the log
    if($test)
    {
        # Test mode: skip the log entirely and work on one fixed file.
        @images = ("File:RAF logo.svg");
        $last_date = undef;
    }
    else
    {
        # Each entry is an array ref; from the uses below, index 0 is the page
        # title, index 2 is presumably the log comment and index 3 presumably
        # the timestamp -- confirm against Pearle::getLogArticles.
        my @articles = Pearle::getLogArticles(log => 'delete', limit => 500, time => $last_date, dir => 'newer');
        foreach my $item (@articles)
        {
            # Get all files that were not moved to Commons
            push @images, $item->[0] if($item->[0] =~ /^(?:Image|File):/ and $item->[2] !~ /^.?F8:/);
        }
        if(scalar(@articles) == 0)
        {
            Pearle::myLog(2, "Empty deletion log\n");
            exit;
        }
        # NOTE(review): the checkpoint is taken from the *first* returned entry;
        # whether that is the newest one depends on how getLogArticles orders
        # its result -- confirm.
        $last_date = $articles[0]->[3];
        Pearle::myLog(2, "Last date: $last_date\n");
    }

    Pearle::myLog(4, join("\n", @images) . "\n");
    Pearle::myLog(2, scalar(@images) . " images found\n");
    $total_processed += scalar(@images);

    # Process for deleted images
    if(scalar(@images) == 0)
    {
        Pearle::myLog(1, "*No images in log\n");
        #exit;
    }
    foreach my $image (@images)
    {
        # Perform various checks that can be done using the canonical image name
        my $image_data = Pearle::APIQuery(titles => [$image], prop => 'imageinfo', meta => 'userinfo', uiprop => ['hasmsg'], # Basic data
            list => 'backlinks', bltitle => $image, blnamespace => [6], bllimit => 500, blfilterredir => 'redirects'); # Image names
        my $removal_prefix = "Deleted image removed:";
        my $removal_comment = "Removing links to deleted file image";

        if($permit_interruptions and DoIHaveMessages($image_data))
        {
            Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n");
            exit;
        }
        # Verify the image is still deleted
        if($image_data !~ /missing=""/)
        {
            Pearle::myLog(2, "*Image [[:$image]] has been re-uploaded.\n");
            next;
        }
        # Images from Commons. May have been masked by the deleted version.
        if($image_data =~ /imagerepository="shared"/)
        {
            Pearle::myLog(2, "*Commons image [[:$image]] found\n");
            next;
        }
        # Check for bug 33292 (e.g. http://en.wikipedia.org/w/api.php?action=query&format=xml&prop=imageinfo&titles=File:RAF%20logo.svg&iilimit=10)
        # Note that this check must be done *after* the Commons check, as the API returns are
        # almost identical -- the only difference is that an image on Commons will be flagged as
        # coming from a shared repository.
        if($image_data =~ /imageinfo/)
        {
            botwarnlog("\n* File [[:$image]] is in an inconsistent state.");
            Pearle::myLog(2, "*File [[:$image]] is in an inconsistent state.\n");
            next;
        }
        if($image !~ /(\.jpg|\.jpeg|\.png|\.gif|\.svg|\.tiff|\.tif)$/i and $image !~ /^http:\/\//i)
        {
            botwarnlog("\n* Non-image media file [[:$image]] found.");
            Pearle::myLog(2, "*Non-image media file [[:$image]] found.\n");
            next; # Non-image files are too hard to work with
        }

        # Perform operations on the image and all redirects
        my @image_names = GetImageNames($image_data);
        push @image_names, $image;
        # join() needs explicit parentheses here: as a list operator it would
        # otherwise swallow the closing "]]\n" as one more element to join.
        Pearle::myLog(2, "*Image has names [[:" . join("]], [[:", @image_names) . "]]\n");

        foreach my $image_name (@image_names)
        {
            $image_data = Pearle::APIQuery(list => 'imageusage', iutitle => $image_name, iunamespace => [0, 10, 12, 14, 100], iulimit => 500);
            my @pages = GetPageList($image_data);
            if(scalar(@pages) == 0)
            {
                notelog("Image $image_name is already orphaned\n");
                next;
            }

            # Strip the namespace prefix and turn the bare name into a regex
            # fragment.  The definedness guard keeps an unparsable title from
            # reaching MakeWikiRegex before the sanity check below rejects it.
            my ($raw_image) = $image_name =~ /(?:Image|File):(.*)/;
            $raw_image = MakeWikiRegex($raw_image) if defined($raw_image);

            # Sanity check
            if(!defined($raw_image) or $image_name !~ /$raw_image/)
            {
                my $shown = defined($raw_image) ? $raw_image : '';
                botwarnlog("\n*Parse error on image [[:$image_name]] ($shown)");
                exit;
            }
            my $image_regex = "[ _]*(?:[Ii][Mm][Aa][Gg][Ee]|[Ff][Ii][Ll][Ee])[ _]*:[ _]*${raw_image}[ _]*";
            Pearle::myLog(3, "Image regex: $image_regex\n");

            # The edit summary always names the canonical title, even when the
            # usage being removed goes through a redirect name.
            my $parsed_removal_comment = $removal_comment;
            $parsed_removal_comment =~ s/image/[[:$image]]/;

            foreach my $page (@pages)
            {
                eval
                {
                    Pearle::myLog(3, "Page for removal: $page\n");
                    my $hits = RemoveImageFromPage($image_name, $page, $image_regex, $removal_prefix, $parsed_removal_comment);
                    if($hits)
                    {
                        Pearle::myLog(2, "Removed image $image with name $image_name from article $page ($hits times)\n");
                        # Only called after a page was actually changed;
                        # presumably the edit throttle -- confirm in Pearle.pm.
                        Pearle::limit();
                    }
                    $images_removed += $hits;
                };
                if($@)
                {
                    # NOTE(review): this numeric comparison relies on $@
                    # numifying to the error code; 925 is presumably the
                    # "page is protected" code raised by the library -- confirm.
                    if(925 == $@)
                    {
                        botwarnlog("\n*Page [[:$page]] is protected removing image [[:$image_name]]");
                    }
                    else
                    {
                        # Anything else is unexpected: propagate it.
                        die;
                    }
                }
            }

            # Verify removal
            # Portal removal is too hard to get correct, and we don't really care about it.
            # Template removal isn't possible, and the template usage has already been logged.
            $image_data = Pearle::APIQuery(list => 'imageusage', iutitle => $image_name, iunamespace => [0, 12, 14], iulimit => 500);
            @pages = GetPageList($image_data);
            if(scalar(@pages) != 0)
            {
#               botwarnlog("\n*Unable to remove all instances of [[:$image]]");
                Pearle::myLog(2, "*Unable to remove all instances of [[:$image]], adding to followup log\n");
                # NOTE(review): unlike the other state files this path is
                # relative to the current directory rather than $homedir --
                # confirm that is what the follow-up script expects.
                if(open my $followup, ">>", "followup.log")
                {
                    flock $followup, LOCK_EX;
                    my $date = time;
                    print {$followup} "$date $image\n";
                    flock $followup, LOCK_UN;
                    close $followup;
                }
                else
                {
                    Pearle::myLog(1, "*Unable to open followup.log: $!\n");
                }
            }
#           sleep 30;
        }
    }
    Pearle::myLog(2, "Finished with set. Removed $images_removed images.\n");
    $total_images += $images_removed;

    # Record the last log entry processed
    if(!$test)
    {
        if(open my $checkpoint, ">", "$homedir/lastfile.log")
        {
            print {$checkpoint} "$last_date\n";
            close $checkpoint;
        }
        else
        {
            Pearle::myLog(1, "*Unable to write lastfile.log: $!\n");
        }
        # Echoed to standard output as well, whether or not the file write
        # succeeded.
        print "$last_date\n";
    }
}
#print "Finished. Total $total_images removed, $total_processed processed.\n";

# Normal end of run: remove the pid file that the startup check looks for.
unlink("$homedir/pid");