Jump to content

User:ImageRemovalBot/removebot.pl

From Wikipedia, the free encyclopedia

ImageRemovalBot's code. Requires User:FairuseBot/Pearle.pm, User:FairuseBot/Pearle/WikiPage.pm, and User:FairuseBot/libBot.pm. User:ImageRemovalBot/removebot-followup.pl is used to follow up on failed removals.

#!/usr/bin/perl


# RemoveBot
#
# A bot to remove deleted images from pages

use strict;
use warnings;

use lib 'Insert the directory containing libBot.pm and Pearle.pm here';
use feature 'unicode_strings';
use Date::Calc qw(Gmtime Add_Delta_Days);
use Fcntl qw(:flock);

use libBot;

# ---- Run-time settings ----

# When true, a single hard-coded test file is processed instead of the
# deletion log, and the last-processed timestamp is not saved afterwards.
my $test = 0;

# Directory that holds the bot's log, cookie file, pid file and lastfile.log.
my $homedir = 'Insert the directory contining the bot\'s script here';

# When true, the bot exits as soon as the API reports a pending talk page
# message for it.
my $permit_interruptions = 0;

# ---- Framework initialisation ----

# Hand Pearle the account credentials plus the log and cookie file locations.
Pearle::init(
	"Insert your bot name here",
	"Insert your bot password here",
	"$homedir/removebot.log",
	"$homedir/cookies.txt",
);
Pearle::config(
	nullOK     => 1,
	printlevel => 4,
	loglevel   => 2,
);
config(username => "Insert your bot name here");

# Without a successful login there is nothing more to do.
exit unless Pearle::login();

# Check for a running copy.
#
# The pid file holds the process id written by the most recent run that got
# this far.  If that process still shows up in `ps` as removebot.pl, this run
# logs a warning and exits; otherwise the pid file is overwritten with our own
# process id.
#
# NOTE(review): the check and the write below are not atomic, so two copies
# started at almost the same moment could both pass the check.
if(-e "$homedir/pid")
{
	# Possible other copy.  Compare PIDs
	my $pid;
	if(open my $pid_in, "<", "$homedir/pid")
	{
		$pid = <$pid_in>;
		close $pid_in;
	}

	# Only a plain decimal number is ever written to the pid file.  Anything
	# else is treated as unreadable rather than being interpolated into a
	# shell command.
	if(defined($pid) and $pid =~ /^\s*(\d+)\s*$/)
	{
		my $old_pid = $1;
		my $psresult = `ps -p $old_pid` // '';	# undef if ps could not be run
		if($psresult =~ /removebot\.pl/)
		{
			botwarnlog("\n*Previous run is taking longer than normal");
			exit;
		}
	}
	else
	{
		Pearle::myLog(1, "Unable to read pidfile, assuming no other copy is running\n");
	}
}

# Record our own process id for the next run's check.  Failure to write is
# logged but is not fatal.
if(open my $pid_out, ">", "$homedir/pid")
{
	print {$pid_out} $$;
	close $pid_out;
}
else
{
	Pearle::myLog(1, "Unable to write pidfile ($!), a later run will not be able to detect this one\n");
}

# Get the last log entry processed.
#
# $last_date ends up as one of:
#   * the first line of lastfile.log (without its newline), or
#   * "now minus two days" if the file exists but cannot be opened or its
#     first line is missing or empty, or
#   * the beginning of the epoch if the file does not exist at all.
my ($last_date);
if(-e "$homedir/lastfile.log")
{
	if(open my $last_in, "<", "$homedir/lastfile.log")
	{
		$last_date = <$last_in>;
		close $last_in;
	}
	chomp $last_date if defined $last_date;

	# Test after chomp so that a file holding only a newline is treated as
	# unreadable instead of yielding an empty timestamp.
	if(!defined($last_date) or length($last_date) == 0)
	{
		# Only the first six values returned by Gmtime() are used here.
		my ($y, $m, $d, $h, $min, $s) = Gmtime();
		($y, $m, $d) = Add_Delta_Days($y, $m, $d, -2);
		$last_date = sprintf("%04d-%02d-%02dT%02d:%02d:%02dZ", $y, $m, $d, $h, $min, $s);
		Pearle::myLog(1, "Unable to read lastfile.log, using $last_date instead\n");
	}
}
else
{
	$last_date = "1970-01-01T00:00:01Z";	# Beginning of time
}

# Main processing.
#
# One "set" does the following:
#   1. Fetch up to 500 deletion-log entries starting at $last_date and keep
#      the Image:/File: titles whose log field [2] does not start with an
#      "F8:" marker.
#   2. For each such file, skip it if it exists again, is served from the
#      shared repository, is in an inconsistent state, or does not have one
#      of the listed image extensions.
#   3. For the file and every redirect to it, call RemoveImageFromPage() on
#      each page that still uses it, then re-query the usage and append the
#      file to followup.log if anything is left.
#   4. Save the new $last_date to lastfile.log (not in test mode).
# Finally the pid file is removed.
#
# With the outer loop header commented out, exactly one set is processed per
# run, so the running totals equal that set's counts.
my $total_images = 0;
my $total_processed = 0;
my $i = 1;	# Only referenced by the disabled loop header below

#for($i = 1; $i <= 100; $i++)
{
	my @images;
	my $images_removed = 0;

	Pearle::myLog(2, "Beginning set at " . time() . "\n");

	# Get the log
	my @articles;
	if($test)
	{
		# A single placeholder entry keeps the empty-log check below from
		# ending a test run.
		@articles = (undef);
		@images = ("File:RAF logo.svg");
	}
	else
	{
		@articles = Pearle::getLogArticles(log => 'delete', limit => 500, time => $last_date, dir => 'newer');
		foreach my $item (@articles)
		{
			# Get all files that were not moved to Commons
			push @images, $item->[0] if($item->[0] =~ /^(?:Image|File):/ and $item->[2] !~ /^.?F8:/);
		}
	}

	if(scalar(@articles) == 0)
	{
		Pearle::myLog(2, "Empty deletion log\n");
		# NOTE(review): this exit (like the others in this block) leaves the
		# pid file in place; the next run only checks whether that pid is
		# still a running removebot.pl.
		exit;
	}

	if($test)
	{
		$last_date = undef;
	}
	else
	{
		# NOTE(review): assumes getLogArticles() returns the most recent
		# entry first, so that element 0 is the right resume point -- confirm
		# against Pearle.pm.
		$last_date = $articles[0]->[3];
		Pearle::myLog(2, "Last date: $last_date\n");
	}

	Pearle::myLog(4, join("\n", @images) . "\n");
	Pearle::myLog(2, scalar(@images) . " images found\n");
	$total_processed += scalar(@images);

	# Process for deleted images

	if(scalar(@images) == 0)
	{
		Pearle::myLog(1, "*No images in log\n");
		#exit;
	}

	foreach my $image (@images)
	{
		# Perform various checks that can be done using the canonical image name
		my $image_data = Pearle::APIQuery(titles => [$image], prop => 'imageinfo', meta => 'userinfo', uiprop => ['hasmsg'], 				# Basic data
		                                  list => 'backlinks', bltitle => $image, blnamespace => [6], bllimit => 500, blfilterredir => 'redirects');	# Image names

		my $removal_prefix = "Deleted image removed:";
		my $removal_comment = "Removing links to deleted file image";

		if($permit_interruptions and DoIHaveMessages($image_data))
		{
			Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n");
			exit;
		}

		# Verify the image is still deleted
		if($image_data !~ /missing=""/)
		{
			Pearle::myLog(2, "*Image [[:$image]] has been re-uploaded.\n");
			next;
		}

		# Images from Commons.  May have been masked by the deleted version.
		if($image_data =~ /imagerepository="shared"/)
		{
			Pearle::myLog(2, "*Commons image [[:$image]] found\n");
			next;
		}

		# Check for bug 33292 (e.g. http://en.wikipedia.org/w/api.php?action=query&format=xml&prop=imageinfo&titles=File:RAF%20logo.svg&iilimit=10)
		# Note that this check must be done *after* the Commons check, as the API returns are
		# almost identical -- the only difference is that an image on Commons will be flagged as
		# coming from a shared repository.
		if($image_data =~ /imageinfo/)
		{
			botwarnlog("\n* File [[:$image]] is in an inconsistent state.");
			Pearle::myLog(2, "*File [[:$image]] is in an inconsistent state.\n");
			next;
		}

		if($image !~ /(\.jpg|\.jpeg|\.png|\.gif|\.svg|\.tiff|\.tif)$/i and $image !~ /^http:\/\//i)
		{
			botwarnlog("\n* Non-image media file [[:$image]] found.");
			Pearle::myLog(2, "*Non-image media file [[:$image]] found.\n");
			next;	# Non-image files are too hard to work with
		}

		# Perform operations on the image and all redirects
		my @image_names = GetImageNames($image_data);
		push @image_names, $image;
		# join() needs its own parentheses here; without them it swallows the
		# closing "]]\n" as one more list element.
		Pearle::myLog(2, "*Image has names [[:" . join("]], [[:", @image_names) . "]]\n");

		# The edit summary names the canonical title, so it is the same for
		# every redirect and can be built once per image.
		my $parsed_removal_comment = $removal_comment;
		$parsed_removal_comment =~ s/image/[[:$image]]/;

		foreach my $image_name (@image_names)
		{
			$image_data = Pearle::APIQuery(list => 'imageusage', iutitle => $image_name, iunamespace => [0, 10, 12, 14, 100], iulimit => 500);
			my @pages = GetPageList($image_data);

			if(scalar(@pages) == 0)
			{
				notelog("Image $image_name is already orphaned\n");
				next;
			}

			my ($raw_image) = $image_name =~ /(?:Image|File):(.*)/;
			$raw_image = MakeWikiRegex($raw_image) if defined($raw_image);

			# Sanity check, done before the value is interpolated into the
			# removal regex.
			if(!defined($raw_image) or $image_name !~ /$raw_image/)
			{
				my $shown = defined($raw_image) ? $raw_image : '';
				botwarnlog("\n*Parse error on image [[:$image_name]] ($shown)");
				exit;
			}

			my $image_regex = "[ _]*(?:[Ii][Mm][Aa][Gg][Ee]|[Ff][Ii][Ll][Ee])[ _]*:[ _]*${raw_image}[ _]*";
			Pearle::myLog(3, "Image regex: $image_regex\n");

			foreach my $page (@pages)
			{
				eval
				{
					Pearle::myLog(3, "Page for removal: $page\n");
					my $hits = RemoveImageFromPage($image_name, $page, $image_regex, $removal_prefix, $parsed_removal_comment);
					if($hits)
					{
						# Pearle::limit() is only called when something was
						# actually removed from the page.
						Pearle::myLog(2, "Removed image $image with name $image_name from article $page ($hits times)\n");
						Pearle::limit();
					}
					$images_removed += $hits;
				};
				if($@)
				{
					# Error 925 is reported as a protected page and skipped;
					# anything else is re-thrown (a bare die propagates $@).
					# NOTE(review): the numeric comparison warns if $@ is a
					# non-numeric string -- confirm what the library throws.
					if(925 == $@)
					{
						botwarnlog("\n*Page [[:$page]] is protected removing image [[:$image_name]]");
					}
					else
					{
						die;
					}
				}
			}

			# Verify removal
			# Portal removal is too hard to get correct, and we don't really care about it.
			# Template removal isn't possible, and the template usage has already been logged.
			$image_data = Pearle::APIQuery(list => 'imageusage', iutitle => $image_name, iunamespace => [0, 12, 14], iulimit => 500);
			@pages = GetPageList($image_data);

			if(scalar(@pages) != 0)
			{
			#	botwarnlog("\n*Unable to remove all instances of [[:$image]]");
				Pearle::myLog(2, "*Unable to remove all instances of [[:$image]], adding to followup log\n");
				# NOTE(review): unlike the other state files this path is
				# relative to the current directory, not $homedir -- confirm
				# where removebot-followup.pl expects to find it.
				if(open my $followup, ">>", "followup.log")
				{
					flock $followup, LOCK_EX;
					my $date = time;
					print {$followup} "$date $image\n";
					flock $followup, LOCK_UN;
					close $followup;
				}
				else
				{
					Pearle::myLog(1, "Unable to open followup.log to record [[:$image]]: $!\n");
				}
			}

#			sleep 30;
		}
	}
	Pearle::myLog(2, "Finished with set.  Removed $images_removed images.\n");
	$total_images += $images_removed;

	# Record the last log entry processed
	if(!$test)
	{
		if(open my $last_out, ">", "$homedir/lastfile.log")
		{
			print {$last_out} "$last_date\n";
			close $last_out or Pearle::myLog(1, "Error closing lastfile.log: $!\n");
		}
		else
		{
			# If this fails, the next run starts from the old timestamp.
			Pearle::myLog(1, "Unable to write lastfile.log: $!\n");
		}
		print "$last_date\n";
	}
}

#print "Finished.  Total $total_images removed, $total_processed processed.\n";

# Normal completion: remove the pid file written at startup.
unlink "$homedir/pid";