Jump to content

User:Full-date unlinking bot/code: Difference between revisions

From Wikipedia, the free encyclopedia
Content deleted Content added
Drop true nb-space, support. Keeps getting lost during cut&paste. Not worth hassle.
Added page blanking detection and avoidance
Line 310: Line 310:
$sqlquery = query("INSERT INTO `unlinked` (`name`) VALUES (\"$link\")");
$sqlquery = query("INSERT INTO `unlinked` (`name`) VALUES (\"$link\")");
if ($contents != $contents_archive) {
if (strlen($contents) == 0) {
echo "Contents blanked during processing of article \"$link\". Skipping save step.";
}
else if ($contents != $contents_archive) {
$return_code = $objwiki->edit($link,$contents,$editsummary,true,true,null,true); // posts the change. The two "true" parameters indicate that this is a bot edit and it is a minor edit
$return_code = $objwiki->edit($link,$contents,$editsummary,true,true,null,true); // posts the change. The two "true" parameters indicate that this is a bot edit and it is a minor edit
if ($return_code['error']['code']=='editconflict') {
if ($return_code['error']['code']=='editconflict') {

Revision as of 18:15, 10 October 2009

<?php
/** fulldateunlinker.php -- Removes link tags from dates
 *  Release Candidate 2
 *
 *  (c) 2009 James Hare (Harej) and others - http://en.wikipedia.org/wiki/User:Harej
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *   
 *	  Developers (add your self here if you worked on the code):
 *    [[User:Harej]]   - Initial code
 *    [[User:Chris G]] - MediaWiki API and database interfacing
 *    [[User:Tcncv]]   - Date-parsing regular expressions, unlinker()
 **/
// Display errors on output and report everything except notices.
ini_set("display_errors", 1);
error_reporting(E_ALL ^ E_NOTICE);
include("./public_html/botclasses.php");  // Botclasses.php was written by User:Chris_G and is available under the GNU General Public License
include("fdublogin.php"); // NOTE(review): presumably defines $botuser and $botpass used at login below -- confirm
 
// For the purposes of unambiguous documentation, the Month-Day-Year style of writing dates will be referred to as "American" and the Day-Month-Year style "British".
// I understand how not-right this is but I felt it was necessary to use two terms that could not be confused with each other.
// ("International" would be a good replacement for "British", but "i" could be confused for "1", plus "int" means "integer".)

// I'm sorry, Chris, but I had to ditch the toolserver DB interfacing in favor of API interfacing.
// The database parts were not working, and there was no way to tell how to fix it because no errors were being put out.
// So I did the easy thing and put the old method, which works, back in.
 
// Log in to the wiki. $objwiki is read as a global by overridecheck(), checktoprocess() and unlinker().
echo "Logging in...";
$objwiki = new wikipedia();
$objwiki->login($botuser, $botpass);
echo " done.\n";

// Global buffer for the wikitext of the page being worked on:
// checktoprocess() fills it and unlinker() rewrites it.
$contents = "";

/* Connect to the database */
// The credentials are read from the account's MySQL option file and kept in
// globals because query() needs them again when it has to reconnect.
echo "Retrieving database login credentials...";
$toolserver_mycnf = parse_ini_file("/home/messedrocker/.my.cnf");
$toolserver_username = $toolserver_mycnf['user'];
$toolserver_password = $toolserver_mycnf['password'];
unset($toolserver_mycnf);
echo " done.\n";

// Open the connection and select the database holding the `unlinked` table;
// the script aborts with the MySQL error text if the database cannot be selected.
echo "Logging into database...";
mysql_connect("sql",$toolserver_username,$toolserver_password);
@mysql_select_db('u_messedrocker_reqs') or die(mysql_error());
echo " done.\n";

/**
 * Runs a SQL statement, reconnecting first if the MySQL connection has gone away.
 *
 * The script aborts (die) with the MySQL error text if the database cannot be
 * selected after a reconnect or if the statement itself fails.
 *
 * @param string $query  Complete SQL statement; the caller is responsible for escaping values.
 * @return mixed  Whatever mysql_query() returned for the statement (never a falsy value).
 */
function query($query) {
	// we need to use this function in case our MySQL connection times out
	global $toolserver_username;
	global $toolserver_password;
	if (!mysql_ping()) {
		mysql_connect("sql",$toolserver_username,$toolserver_password);
		@mysql_select_db('u_messedrocker_reqs') or die(mysql_error());
	}

	// "return mysql_query(...) or die(...)" would return the boolean value of the
	// whole "or" expression (always true) instead of the query result, so the
	// result is captured and checked separately.
	$result = mysql_query($query);
	if (!$result) {
		die(mysql_error());
	}
	return $result;
}

 
function overridecheck() {
	// Emergency stop: fetches the bot's manual override page and halts the whole script
	// when the safety phrase "Joe Biden" appears anywhere in its text.
	// The phrase was picked because it is unlikely to be placed on that page by accident.
	// NOTE(review): older documentation spells the page "Manual override" (capital M) while
	// the title requested here is lower-case -- confirm which title actually exists.

	global $objwiki;

	$safetyword = "Joe Biden";
	$overridetext = $objwiki->getpage("User:Full-date unlinking bot/manual override");

	// Nothing to do unless the safety phrase is present.
	if (strpos($overridetext, $safetyword) === false) {
		return;
	}

	die("Manual override has been triggered. Shutting down.");
}
 
/**
 * Decides whether $page should be processed by unlinker().
 *
 * Side effect: on the path that fetches the page, its wikitext is stored in the
 * global $contents, which unlinker() then works on.
 *
 * @param string $page  Article title.
 * @return bool  false when the title matches an excluded pattern, the text cannot be
 *               fetched, the nobots check refuses, or the page is already recorded
 *               in the `unlinked` table; true otherwise.
 */
function checktoprocess($page) {
	// Title-based exclusions: date, year, century/millennium and "List of ..." articles are never touched.
	$regex1 = "/^(January|February|March|April|May|June|July|August|September|October|November|December)(\s\d{1,2})?/"; // matches Month-Date
	$regex2 = "/^\d{1,4}(st|rd|th|nd|s)?\s?(century|millennium)?( BC)?/i"; // matches year, century, and millennium articles, BC and AD
	$regex3 = "/^List of \d{1,4}(st|rd|th|nd|s)?\s?(century|millennium)?( BC)?/i"; // List of (year or year range) Xs
	$regex4 = "/^List of .* in (the )?\d{1,4}(st|rd|th|nd|s)?\s?(century|millennium)?( BC)?/i"; // List of Xs in the (year or year range)
	if (preg_match($regex1, $page) || preg_match($regex2, $page) || preg_match($regex3, $page) || preg_match($regex4, $page)) {
		return false;
	}

	global $objwiki;
	global $contents;
	global $botuser;

	// Fetch the page text, retrying a few times. The retry is bounded so that a page
	// that is genuinely empty (or an API that keeps failing) cannot hang the bot forever.
	$max_attempts = 5;
	for ($attempt = 1; $attempt <= $max_attempts; $attempt++) {
		$contents = $objwiki->getpage($page,null,true);
		if ($contents != "") {
			break;
		}
		if ($attempt < $max_attempts) {
			sleep(1); // brief pause before asking the API again
		}
	}
	if ($contents == "") {
		echo "Could not retrieve any text for \"$page\" after $max_attempts attempts.\n";
		return false;
	}

	// Honour the nobots check provided by botclasses.
	if (!$objwiki->nobots($page,$botuser,$contents)) {
		return false;
	}

	// Has this page been recorded as processed already? The title is escaped because
	// article titles may contain quotes, which would otherwise break the statement.
	$escaped_page = mysql_real_escape_string($page);
	$check = mysql_query("select * from `unlinked` where `name`=\"$escaped_page\"");
	if ($check === false) {
		// The lookup itself failed (for instance a timed-out connection). As before,
		// the page is treated as not yet processed; query() in unlinker() reconnects.
		return true;
	}

	$row = mysql_fetch_assoc($check);
	if ($row && $row['name'] == $page) {
		return false;
	}
	return true;
}
 
/**
 * Removes the wiki links from full dates in the text held in the global $contents
 * and saves the result to the article $link.
 *
 * The title is recorded in the `unlinked` table whether or not an edit is made.
 * No edit is posted when the text is unchanged or when processing left it empty.
 *
 * @param string $link  Title of the article whose text is currently in $contents.
 * @return void
 */
function unlinker($link) {
	global $objwiki;
	global $contents;
	$contents_archive = $contents; // this is to maintain an unchanged version for comparison purposes. if there is no change, the bot will not send the API request to edit the page
	$editsummary = "Unlinking full-dates. [[User:Full-date unlinking bot|Details here]]. Codes: ";

	//==========  Define regular expression date building blocks

	// Root date components
	$part_m = 'Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|'
		. 'July?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?';
	$part_d = '\d{1,2}';
	$part_y = '\d{1,4}(?:[ _]BC)?';

	// Captured date components
	$part_c_m = '(' . $part_m . ')';
	$part_c_d = '(' . $part_d . ')';
	$part_c_y = '(' . $part_y . ')';

	// Linked captured date components
	$part_lc_md = '\[\[' . $part_c_m . '[ _]' . $part_c_d . '\]\]';  // [[month day]]
	$part_lc_dm = '\[\[' . $part_c_d . '[ _]' . $part_c_m . '\]\]';  // [[day month]]
	$part_lc_y = '\[\[' . $part_c_y . '\]\]';  // [[year]]

	//==========  Build regular expressions for dmy, mdy, and (less common) ymd formats

	$part_AMreg_punct = ', ';
	$part_BRreg_punct = ' ';
	$part_AModd_punct = '(?!, \[)(?: *(?:, *)?)';  // spaces and optional comma, excluding comma + single space
	$part_BRodd_punct = '(?! \[)(?: *(?:, *)?)';   // spaces and optional comma, excluding single space
	$part_YMD_punct = ' *';                        // Recognize only spaces (zero or more)

	$regex_AMreg = '/' . $part_lc_md . $part_AMreg_punct . $part_lc_y . '/i';
	$regex_BRreg = '/' . $part_lc_dm . $part_BRreg_punct . $part_lc_y . '/i';
	$regex_AModd = '/' . $part_lc_md . $part_AModd_punct . $part_lc_y . '/i';
	$regex_BRodd = '/' . $part_lc_dm . $part_BRodd_punct . $part_lc_y . '/i';
	$regex_YMD   = '/' . $part_lc_y . $part_YMD_punct . $part_lc_md . '/i';

	$replace_AM = '§~§$1 $2, $3';  // "§~§" is a marker (deleted later) that supports late list processing
	$replace_BR = '§~§$1 $2 $3';
	$replace_YMD = '$1 $2 $3';

	// For information and review purposes, the above expressions are equivalent to:
	// $regex_AMreg = '/\[\[(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[ _](\d{1,2})\]\], \[\[(\d{1,4}(?:[ _]BC)?)\]\]/i'
	// $regex_BRreg = '/\[\[(\d{1,2})[ _](Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\]\] \[\[(\d{1,4}(?:[ _]BC)?)\]\]/i'
	// $regex_AModd = '/\[\[(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[ _](\d{1,2})\]\](?!, \[)(?: *(?:, *)?)\[\[(\d{1,4}(?:[ _]BC)?)\]\]/i'
	// $regex_BRodd = '/\[\[(\d{1,2})[ _](Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\]\](?! \[)(?: *(?:, *)?)\[\[(\d{1,4}(?:[ _]BC)?)\]\]/i'
	// $regex_YMD   = '/\[\[(\d{1,4}(?:[ _]BC)?)\]\] *\[\[(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[ _](\d{1,2})\]\]/i'

	//==========  Define regular expressions for ISO 8601 like dates.

	// Negative year forms are also recognized, but will likely never be encountered.  Technically,
	// ISO 8601 dates are only valid for years 1583 through 9999 of the Gregorian calendar, but
	// we will not enforce those rules here.
	$regex_ISO1 = '/' . '\[\[(-?\d{4}-\d{2}-\d{2})\]\]' . '/i';            // [[yyyy-mm-dd]] or [[-yyyy-mm-dd]]
	$regex_ISO2 = '/' . '\[\[(-?\d{4})\]\]-\[\[(\d{2}-\d{2})\]\]' . '/i';  // [[yyyy]]-[[mm-dd]] or [[-yyyy]]-[[mm-dd]]
	$replace_ISO1 = '$1';
	$replace_ISO2 = '$1-$2';

	//==========  Define regular expressions to extend processing for date ranges and lists

	// For date lists, build a single match and capture expression that allows any of the forms:
	// [[mmm dd]], [[dd mmm]], [[mmm dd|dd]] and [[dd mmm|dd]] (and even [[mmm dd|]] and [[dd mmm|]]).
	// Three values are captured -- first date part, second date part, and pipe text (or blank if not present).
	// The first two date parts may be month+day, day+month, but not month+month or day+day.
	$part_c_m_or_d = '(' . $part_d . '|' . $part_m . ')';
	$part_c_pipe_day = '((?:[|](?:' . $part_d . ')?)?)';  // Matches nothing, pipe + day. or pipe + empty string
	$part_verify_m_and_d = '(?=[^\]|]*[A-Z])' . '(?=[^\]|]*\d)';  // Lookahead to verify m+d or d+m (not m+m or d+d)
	$part_lc_general = '\[\[' . $part_verify_m_and_d . $part_c_m_or_d . '[ _]' . $part_c_m_or_d . $part_c_pipe_day . '\]\]';

	// Define words and punctuation that may appear between items of a date range or list.
	// Optional comma, whitespace, separator punctuation or word, more whitespace
	$part_list_commaopt = ',?';
	$part_list_spacing = '(?: |&nbsp;|<br */?>)*'; // Zero or more: Space, symbolic nb-space, line break
	$part_list_word =
		'(?:-|–|—|−|~'      // hyphen, en dash, em dash, minus, tilde
		. '|/|&|[+]|×|x|,|;'  // slash, ampersand, plus, times, letter x, comma, semicolon
		. '|to|and|or|until|till|til|through|thru|into'
		. '|&ndash;|&mdash;|\{\{ndash\}\}'
		. ')';
	$part_c_list_separator = '(' . $part_list_commaopt . $part_list_spacing . '(?:' . $part_list_word . $part_list_spacing . ')?)';

	// In the following expressions "§~§" and "~" are used as placeholders.  They have no special meaning.

	// Define expression to search for a date list not anchored by a standard form of the mdy or
	// dmy date.  This will catch the cases where the date part immediately preceding the year
	// is piped.  Punctuation is left unchanged.
	// Example: [[April 23]]/[[April 24|24]], [[1966]]
	// ...will be replaced with "§~§April~23~~/§~§April~24~|24~, 1966".
	// ...which will later be cleaned up as "April 23/24, 1966"
	$regex_list_base = '@' . $part_lc_general . $part_c_list_separator . $part_lc_general . '( *(?:, *)?)' . $part_lc_y . '@i';
	$replace_list_base = '§~§~$1~$2~$3~' . '$4' . '§~§~$5~$6~$7~' . '$8' . '$9';

	// Search for additional month-day or day-month parts to the left of already processed dates.
	$regex_list_extend = '@' . $part_lc_general . $part_c_list_separator . '(?=§~§)' . '@i';
	$replace_list_extend = '§~§~$1~$2~$3~' . '$4';

	// Convert intermediate date list string replacements with final unlinked form ("month day", "day month" or "day").
	$regex_list_cleanup_nopipe = '/' . '§~§~([^~]*)~([^~]*)~[|]?~' . '/i';
	$replace_list_cleanup_nopipe = '$1 $2';

	$regex_list_cleanup_pipetext = '/' . '§~§~([^~]*)~([^~]*)~[|]([^~]*)~' . '/i';
	$replace_list_cleanup_pipetext = '$3';  // Discard link target, replace with pipe text only

	// Remove any remaining placeholders.
	$regex_cleanup_final = '/' . '§~§' . '/i';
	$replace_cleanup_final = '';

	//==========  Begin search and replace ordinary dates

	// Each pass is (edit-summary code, pattern, replacement). The order matters: the
	// regular forms run before the "odd punctuation" forms, then ymd, then the ISO forms.
	// preg_replace() takes its count argument by reference on its own, so the variable is
	// passed plainly; call-time "&$var" is a fatal error from PHP 5.4 onwards.
	$passes = array(
		array("AMreg", $regex_AMreg, $replace_AM),
		array("BRreg", $regex_BRreg, $replace_BR),
		array("AModd", $regex_AModd, $replace_AM),
		array("BRodd", $regex_BRodd, $replace_BR),
		array("YMD",   $regex_YMD,   $replace_YMD),
		array("ISO1",  $regex_ISO1,  $replace_ISO1),
		array("ISO2",  $regex_ISO2,  $replace_ISO2),
	);
	foreach ($passes as $pass) {
		list($code, $regex, $replacement) = $pass;
		$match_count = 0;
		$contents = preg_replace($regex, $replacement, $contents, -1, $match_count);
		if ($match_count > 0) {
			$editsummary .= $code . "(×" . $match_count . "), ";
		}
	}

	//==========  Begin search and replace date lists

	$date_list_count = 0;

	// Search for two part date range or list where rightmost part did not match one of the
	// dmy or mdy patterns above.  Will typically match "[[mmm dd]] - [[mmm dd|dd]], [[yyyy]]"
	// forms or variations.
	$match_count = 0;
	$contents = preg_replace($regex_list_base, $replace_list_base, $contents, -1, $match_count);
	if ($match_count > 0) {
		$editsummary .= "Lists1(×" . $match_count . "), ";
	}
	$date_list_count += $match_count;

	// Process additional date parts to the left of a previously identified date or list.
	// At most ten rounds; stops as soon as a round finds nothing more to extend.
	for ($i = 0; $i < 10; $i++) {
		$match_count = 0;
		$contents = preg_replace($regex_list_extend, $replace_list_extend, $contents, -1, $match_count);
		if ($match_count == 0) break;
		$date_list_count += $match_count;
	}

	if ($date_list_count > 0) {
		$editsummary .= "Lists2(×" . $date_list_count . "), ";
	}

	// Finalize date list item format and remove any remaining marker strings ("§~§")
	$contents = preg_replace($regex_list_cleanup_nopipe, $replace_list_cleanup_nopipe, $contents);
	$contents = preg_replace($regex_list_cleanup_pipetext, $replace_list_cleanup_pipetext, $contents);
	$contents = preg_replace($regex_cleanup_final, $replace_cleanup_final, $contents);

	//==========  Postprocessing

	$editsummary = substr($editsummary, 0, -2); // to get rid of superfluous comma and space

	overridecheck(); // checks if the manual override has been triggered

	// Record the page as processed. The title is escaped because article titles may
	// contain quotes, which would otherwise break the statement and abort the bot.
	$escaped_link = mysql_real_escape_string($link);
	$sqlquery = query("INSERT INTO `unlinked` (`name`) VALUES (\"$escaped_link\")");

	if (strlen($contents) == 0) {
		// Never save an empty page: an empty result means something went wrong during
		// processing (preg_replace() returns NULL on error).
		echo "Contents blanked during processing of article \"$link\".  Skipping save step.";
	}
	else if ($contents != $contents_archive) {
		$return_code = $objwiki->edit($link,$contents,$editsummary,true,true,null,true); // posts the change. The two "true" parameters indicate that this is a bot edit and it is a minor edit
		if (isset($return_code['error']['code']) && $return_code['error']['code']=='editconflict') {
			echo 'Edit conflict detected....';
		}
		sleep(10); // pause after every edit request
	}
}

// Main driver: for every month, walk the article-namespace backlinks of the month page
// itself ("January") and of each of its day pages ("January 1" .. "January 31"), and
// unlink the dates on every page that checktoprocess() approves.
$months = array("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December");
foreach ($months as $month) { // for each month
	// Backlink targets for this month: the bare month first, then every day of it.
	$targets = array($month);
	for ($d = 1; $d < 32; $d++) {
		$targets[] = $month . " " . $d;
	}

	foreach ($targets as $position => $target) {
		if ($position > 0) {
			// Announced for the day pages only, now with the actual month name
			// (the undefined variable used before printed nothing in its place).
			echo "Checking backlinks to " . $target . "\n";
		}

		$links = $objwiki->whatlinkshere($target, "&blnamespace=0");
		if (!is_array($links)) {
			continue; // nothing usable came back for this target
		}

		foreach ($links as $link) {
			echo "Checking " . $link . "\n";
			if (checktoprocess($link)) { // if the checktoprocess function returns true
				echo $link . " shall be processed.\n";
				unlinker($link);
			}
			else {
				echo $link . " shall NOT be processed.\n";
			}
		}
	}
}
?>