User:Full-date unlinking bot/code: Difference between revisions
Appearance
Content deleted Content added
Drop true nb-space, support. Keeps getting lost during cut&paste. Not worth hassle. |
Added page blanking detection and avoidance |
||
Line 310: | Line 310: | ||
$sqlquery = query("INSERT INTO `unlinked` (`name`) VALUES (\"$link\")"); |
$sqlquery = query("INSERT INTO `unlinked` (`name`) VALUES (\"$link\")"); |
||
if (strlen($contents) == 0) { |
|||
echo "Contents blanked during processing of article \"$link\". Skipping save step."; |
|||
} |
|||
else if ($contents != $contents_archive) { |
|||
$return_code = $objwiki->edit($link,$contents,$editsummary,true,true,null,true); // posts the change. The two "true" parameters indicate that this is a bot edit and it is a minor edit |
$return_code = $objwiki->edit($link,$contents,$editsummary,true,true,null,true); // posts the change. The two "true" parameters indicate that this is a bot edit and it is a minor edit |
||
if ($return_code['error']['code']=='editconflict') { |
if ($return_code['error']['code']=='editconflict') { |
Revision as of 18:15, 10 October 2009
<?php
/** fulldateunlinker.php -- Removes link tags from dates
* Release Candidate 2
*
* (c) 2009 James Hare (Harej) and others - http://en.wikipedia.org/wiki/User:Harej
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Developers (add yourself here if you worked on the code):
* [[User:Harej]] - Initial code
* [[User:Chris G]] - MediaWiki API and database interfacing
* [[User:Tcncv]] - Date-parsing regular expressions, unlinker()
**/
// ---- Runtime configuration -------------------------------------------------
// Show errors on the console; report everything except notices (the script
// reads array keys and variables that may be unset, which would otherwise be noisy).
ini_set("display_errors", 1);
error_reporting(E_ALL ^ E_NOTICE);
include("./public_html/botclasses.php"); // Botclasses.php was written by User:Chris_G and is available under the GNU General Public License
// presumably defines $botuser and $botpass, which are used for the login below -- confirm against fdublogin.php
include("fdublogin.php");
// For the purposes of unambiguous documentation, the Month-Day-Year style of writing dates will be referred to as "American" and the Day-Month-Year style "British".
// I understand how not-right this is but I felt it was necessary to use two terms that could not be confused with each other.
// ("International" would be a good replacement for "British", but "i" could be confused for "1", plus "int" means "integer".)
// I'm sorry, Chris, but I had to ditch the toolserver DB interfacing in favor of API interfacing.
// The database parts were not working, and there was no way to tell how to fix it because no errors were being put out.
// So I did the easy thing and put the old method, which works, back in.

// ---- Wiki login ------------------------------------------------------------
// $objwiki is the shared API client; overridecheck(), checktoprocess() and
// unlinker() all reach it through "global $objwiki".
echo "Logging in...";
$objwiki = new wikipedia();
$objwiki->login($botuser, $botpass);
echo " done.\n";
// Shared buffer for the wikitext of the page currently being handled:
// checktoprocess() fills it and unlinker() rewrites it (both via "global $contents").
$contents = "";
/* Connect to the database */
// Credentials are read from the account's MySQL client config file
// (keys "user" and "password"); the parsed array is discarded right after so
// only the two values stay in memory.
echo "Retrieving database login credentials...";
$toolserver_mycnf = parse_ini_file("/home/messedrocker/.my.cnf");
$toolserver_username = $toolserver_mycnf['user'];
$toolserver_password = $toolserver_mycnf['password'];
unset($toolserver_mycnf);
echo " done.\n";
// $toolserver_username / $toolserver_password must remain globals: query()
// re-uses them to reconnect when the connection has dropped.
echo "Logging into database...";
// NOTE(review): the return value of mysql_connect() is not checked here; a failed
// connection is only noticed when the database selection on the next line fails and die()s.
mysql_connect("sql",$toolserver_username,$toolserver_password);
@mysql_select_db('u_messedrocker_reqs') or die(mysql_error());
echo " done.\n";
/**
 * Runs a SQL statement on the bot's database, reconnecting first if the
 * connection has timed out.
 *
 * Terminates the script with the MySQL error message if the database cannot
 * be selected after a reconnect or if the statement itself fails.
 *
 * @param string $query Complete SQL statement; the caller is responsible for escaping any values in it.
 * @return mixed Whatever mysql_query() returned: a result resource for row-returning
 *               statements, or true for statements such as INSERT.
 */
function query($query) {
    // we need to use this function in case our MySQL connection times out
    global $toolserver_username;
    global $toolserver_password;

    if (!mysql_ping()) {
        mysql_connect("sql", $toolserver_username, $toolserver_password);
        @mysql_select_db('u_messedrocker_reqs') or die(mysql_error());
    }

    // The previous form, "return mysql_query($query) or die(...)", returned the
    // boolean value of the whole "or" expression (always true), so callers could
    // never get at a result set. Keep the result and test it explicitly instead.
    $result = mysql_query($query);
    if ($result === false) {
        die(mysql_error());
    }
    return $result;
}
/**
 * Emergency stop. Fetches the bot's manual-override page and terminates the
 * script if the safety phrase "Joe Biden" appears anywhere in its text.
 *
 * The phrase was picked by the original author as a string that will not end
 * up on the override page by accident.
 *
 * NOTE(review): page titles are case-sensitive after the first character --
 * confirm that the on-wiki page really is ".../manual override" with a lowercase "m".
 *
 * @return void Returns normally only when the override has not been triggered.
 */
function overridecheck() {
    global $objwiki;

    $override_text = $objwiki->getpage("User:Full-date unlinking bot/manual override");
    $triggered = (strpos($override_text, "Joe Biden") !== false);

    if (!$triggered) {
        return;
    }

    die("Manual override has been triggered. Shutting down.");
}
/**
 * Decides whether $page should be handed to unlinker().
 *
 * Checks, in order:
 *   1. The title must not look like a date/year/century/millennium article or
 *      a "List of ..." article about a year or period (title-only test).
 *   2. The page text must be retrievable. It is stored in the global $contents
 *      as a side effect, which is where unlinker() picks it up.
 *   3. The nobots() check of the wiki client must pass.
 *   4. The title must not already be recorded in the `unlinked` table.
 *
 * NOTE(review): the title patterns are anchored at the start only, so they also
 * exclude any title that merely begins with a month name or with a digit
 * (e.g. "May Day"). That errs on the side of skipping pages; confirm it is
 * the intended reading of the bot's exclusion criteria before tightening.
 *
 * @param string $page Page title.
 * @return bool true if the page should be processed, false otherwise.
 */
function checktoprocess($page) {
    global $objwiki;
    global $contents;
    global $botuser;

    $regex1 = "/^(January|February|March|April|May|June|July|August|September|October|November|December)(\s\d{1,2})?/"; // matches Month-Date
    $regex2 = "/^\d{1,4}(st|rd|th|nd|s)?\s?(century|millennium)?( BC)?/i"; // matches year, century, and millennium articles, BC and AD
    $regex3 = "/^List of \d{1,4}(st|rd|th|nd|s)?\s?(century|millennium)?( BC)?/i"; // List of (year or year range) Xs
    $regex4 = "/^List of .* in (the )?\d{1,4}(st|rd|th|nd|s)?\s?(century|millennium)?( BC)?/i"; // List of Xs in the (year or year range)
    if (preg_match($regex1, $page) || preg_match($regex2, $page) || preg_match($regex3, $page) || preg_match($regex4, $page)) {
        return false;
    }

    // Fetch the page text. An empty result is retried, but only a bounded
    // number of times: a page that is genuinely blank (or cannot be fetched)
    // would otherwise keep the bot requesting the same page forever.
    $max_fetch_attempts = 5;
    $attempt = 0;
    do {
        $contents = $objwiki->getpage($page,null,true);
        $attempt++;
    } while ($contents == "" && $attempt < $max_fetch_attempts);
    if ($contents == "") {
        echo "Could not retrieve any text for \"$page\" after $max_fetch_attempts attempts.\n";
        return false;
    }

    // presumably nobots() returns false when the page opts out of edits by this bot -- verify against botclasses.php
    if (!$objwiki->nobots($page,$botuser,$contents)) {
        return false;
    }

    // Look the title up in the table of already-processed pages. Titles may
    // contain quotes, so the value has to be escaped before it goes into the SQL.
    $safe_page = mysql_real_escape_string($page);
    $check = mysql_query("select * from `unlinked` where `name`=\"$safe_page\"");
    $row = mysql_fetch_assoc($check);
    if (is_array($row) && $row['name'] === $page) {
        return false; // already handled in an earlier run
    }
    return true;
}
/**
 * Removes link brackets from full dates in the wikitext held in the global
 * $contents (filled by checktoprocess()) and saves the page if anything changed.
 *
 * Steps:
 *   1. Build the date regular expressions.
 *   2. Run the simple passes (mdy, dmy, ymd, ISO-like) and then the date
 *      list/range passes, collecting a code for each pass that matched
 *      into the edit summary.
 *   3. Check the manual override, record the page in the `unlinked` table, and
 *      post the edit unless the text is unchanged or came out empty.
 *
 * @param string $link Title of the page being processed.
 * @return void
 */
function unlinker($link) {
    global $objwiki;
    global $contents;

    $contents_archive = $contents; // this is to maintain an unchanged version for comparison purposes. if there is no change, the bot will not send the API request to edit the page
    $editsummary = "Unlinking full-dates. [[User:Full-date unlinking bot|Details here]]. Codes: ";

    //========== Define regular expression date building blocks
    // Root date components
    $part_m = 'Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|'
            . 'July?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?';
    $part_d = '\d{1,2}';
    $part_y = '\d{1,4}(?:[ _]BC)?';
    // Captured date components
    $part_c_m = '(' . $part_m . ')';
    $part_c_d = '(' . $part_d . ')';
    $part_c_y = '(' . $part_y . ')';
    // Linked captured date components
    $part_lc_md = '\[\[' . $part_c_m . '[ _]' . $part_c_d . '\]\]'; // [[month day]]
    $part_lc_dm = '\[\[' . $part_c_d . '[ _]' . $part_c_m . '\]\]'; // [[day month]]
    $part_lc_y = '\[\[' . $part_c_y . '\]\]'; // [[year]]

    //========== Build regular expressions for dmy, mdy, and (less common) ymd formats
    $part_AMreg_punct = ', ';
    $part_BRreg_punct = ' ';
    $part_AModd_punct = '(?!, \[)(?: *(?:, *)?)'; // spaces and optional comma, excluding comma + single space
    $part_BRodd_punct = '(?! \[)(?: *(?:, *)?)'; // spaces and optional comma, excluding single space
    $part_YMD_punct = ' *'; // Recognize only spaces (zero or more)
    $regex_AMreg = '/' . $part_lc_md . $part_AMreg_punct . $part_lc_y . '/i';
    $regex_BRreg = '/' . $part_lc_dm . $part_BRreg_punct . $part_lc_y . '/i';
    $regex_AModd = '/' . $part_lc_md . $part_AModd_punct . $part_lc_y . '/i';
    $regex_BRodd = '/' . $part_lc_dm . $part_BRodd_punct . $part_lc_y . '/i';
    $regex_YMD = '/' . $part_lc_y . $part_YMD_punct . $part_lc_md . '/i';
    $replace_AM = '§~§$1 $2, $3'; // "§~§" is a marker (deleted later) that supports late list processing
    $replace_BR = '§~§$1 $2 $3';
    $replace_YMD = '$1 $2 $3';
    // For information and review purposes, the above expressions are equivalent to:
    // $regex_AMreg = '/\[\[(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[ _](\d{1,2})\]\], \[\[(\d{1,4}(?:[ _]BC)?)\]\]/i'
    // $regex_BRreg = '/\[\[(\d{1,2})[ _](Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\]\] \[\[(\d{1,4}(?:[ _]BC)?)\]\]/i'
    // $regex_AModd = '/\[\[(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[ _](\d{1,2})\]\](?!, \[)(?: *(?:, *)?)\[\[(\d{1,4}(?:[ _]BC)?)\]\]/i'
    // $regex_BRodd = '/\[\[(\d{1,2})[ _](Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\]\](?! \[)(?: *(?:, *)?)\[\[(\d{1,4}(?:[ _]BC)?)\]\]/i'
    // $regex_YMD = '/\[\[(\d{1,4}(?:[ _]BC)?)\]\] *\[\[(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[ _](\d{1,2})\]\]/i'

    //========== Define regular expressions for ISO 8601 like dates.
    // Negative year forms are also recognized, but will likely never be encountered. Technically,
    // ISO 8601 dates are only valid for years 1583 through 9999 of the Gregorian calendar, but
    // we will not enforce those rules here.
    $regex_ISO1 = '/' . '\[\[(-?\d{4}-\d{2}-\d{2})\]\]' . '/i'; // [[yyyy-mm-dd]] or [[-yyyy-mm-dd]]
    $regex_ISO2 = '/' . '\[\[(-?\d{4})\]\]-\[\[(\d{2}-\d{2})\]\]' . '/i'; // [[yyyy]]-[[mm-dd]] or [[-yyyy]]-[[mm-dd]]
    $replace_ISO1 = '$1';
    $replace_ISO2 = '$1-$2';

    //========== Define regular expressions to extend processing for date ranges and lists
    // For date lists, build a single match and capture expression that allows any of the forms:
    // [[mmm dd]], [[dd mmm]], [[mmm dd|dd]] and [[dd mmm|dd]] (and even [[mmm dd|]] and [[dd mmm|]]).
    // Three values are captured -- first date part, second date part, and pipe text (or blank if not present).
    // The first two date parts may be month+day, day+month, but not month+month or day+day.
    $part_c_m_or_d = '(' . $part_d . '|' . $part_m . ')';
    $part_c_pipe_day = '((?:[|](?:' . $part_d . ')?)?)'; // Matches nothing, pipe + day. or pipe + empty string
    $part_verify_m_and_d = '(?=[^\]|]*[A-Z])' . '(?=[^\]|]*\d)'; // Lookahead to verify m+d or d+m (not m+m or d+d)
    $part_lc_general = '\[\[' . $part_verify_m_and_d . $part_c_m_or_d . '[ _]' . $part_c_m_or_d . $part_c_pipe_day . '\]\]';
    // Define words and punctuation that may appear between items of a date range or list.
    // Optional comma, whitespace, separator punctuation or word, more whitespace
    $part_list_commaopt = ',?';
    // Zero or more: space, symbolic nb-space (the literal entity text "&nbsp;"), line break.
    // The entity must be spelled out; when it decays into a second plain space the
    // alternation has two identical branches under "*", which adds nothing and
    // invites heavy backtracking on runs of spaces.
    $part_list_spacing = '(?: |&nbsp;|<br */?>)*';
    $part_list_word =
          '(?:-|–|—|−|~' // hyphen, en dash, em dash, minus, tilda
        . '|/|&|[+]|×|x|,|;' // slash, ampersand, plus, times, letter x, comma, semicolon
        . '|to|and|or|until|till|til|through|thru|into'
        . '|&ndash;|&mdash;|\{\{ndash\}\}' // symbolic en dash / em dash entities and the {{ndash}} template
        . ')';
    $part_c_list_separator = '(' . $part_list_commaopt . $part_list_spacing . '(?:' . $part_list_word . $part_list_spacing . ')?)';
    // In the following expressions "§~§" and "~" are used as placeholders. They have no special meaning.
    // Define expression to search for a date list not anchored by a standard form of the mdy or
    // dmy date. This will catch the cases where the date part immediately preceding the year
    // is piped. Punctuation is left unchanged.
    // Example: [[April 23]]/[[April 24|24]], [[1966]]
    // ...will be replaced with "§~§April~23~~/§~§April~24~|24~, 1966".
    // ...which will later be cleaned up as "April 23/24, 1966"
    $regex_list_base = '@' . $part_lc_general . $part_c_list_separator . $part_lc_general . '( *(?:, *)?)' . $part_lc_y . '@i';
    $replace_list_base = '§~§~$1~$2~$3~' . '$4' . '§~§~$5~$6~$7~' . '$8' . '$9';
    // Search for additional month-day or day-month parts to the left of already processed dates.
    $regex_list_extend = '@' . $part_lc_general . $part_c_list_separator . '(?=§~§)' . '@i';
    $replace_list_extend = '§~§~$1~$2~$3~' . '$4';
    // Convert intermediate date list string replacements with final unlinked form ("month day", "day month" or "day").
    $regex_list_cleanup_nopipe = '/' . '§~§~([^~]*)~([^~]*)~[|]?~' . '/i';
    $replace_list_cleanup_nopipe = '$1 $2';
    $regex_list_cleanup_pipetext = '/' . '§~§~([^~]*)~([^~]*)~[|]([^~]*)~' . '/i';
    $replace_list_cleanup_pipetext = '$3'; // Discard link target, replace with pipe text only
    // Remove any remaining placeholders.
    $regex_cleanup_final = '/' . '§~§' . '/i';
    $replace_cleanup_final = '';

    //========== Begin search and replace ordinary dates
    // Each entry: summary code, pattern, replacement. Order matters: the regular
    // forms run before the "odd punctuation" forms, then ymd, then the ISO forms.
    $simple_passes = array(
        array('AMreg', $regex_AMreg, $replace_AM),
        array('BRreg', $regex_BRreg, $replace_BR),
        array('AModd', $regex_AModd, $replace_AM),
        array('BRodd', $regex_BRodd, $replace_BR),
        array('YMD',   $regex_YMD,   $replace_YMD),
        array('ISO1',  $regex_ISO1,  $replace_ISO1),
        array('ISO2',  $regex_ISO2,  $replace_ISO2),
    );
    foreach ($simple_passes as $pass) {
        $match_count = 0;
        // preg_replace() already declares its count parameter by reference, so the
        // variable is passed plainly. Writing "&$match_count" at the call site is
        // call-time pass-by-reference, which is a fatal error from PHP 5.4 on.
        $contents = preg_replace($pass[1], $pass[2], $contents, -1, $match_count);
        if ($match_count > 0) {
            $editsummary .= $pass[0] . "(×" . $match_count . "), ";
        }
    }

    //========== Begin search and replace date lists
    $date_list_count = 0;
    // Search for two part date range or list where rightmost part did not match one of the
    // dmy or mdy patterns above. Will typically match "[[mmm dd]] - [[mmm dd|dd]], [[yyyy]]"
    // forms or variations.
    $match_count = 0;
    $contents = preg_replace($regex_list_base, $replace_list_base, $contents, -1, $match_count);
    if ($match_count > 0) {
        $editsummary .= "Lists1(×" . $match_count . "), ";
    }
    $date_list_count += $match_count;
    // Process additional date parts to the left of a previously identified date or list.
    // Every pass can extend each list by one item; ten passes is the hard upper bound.
    for ($i = 0; $i < 10; $i++) {
        $match_count = 0;
        $contents = preg_replace($regex_list_extend, $replace_list_extend, $contents, -1, $match_count);
        if ($match_count == 0) break;
        $date_list_count += $match_count;
    }
    // NOTE(review): $date_list_count also contains the Lists1 matches, so the
    // "Lists2" figure is a running total rather than the count of extensions
    // alone. Left as is because it only affects the wording of the edit summary -- confirm intent.
    if ($date_list_count > 0) {
        $editsummary .= "Lists2(×" . $date_list_count . "), ";
    }
    // Finalize date list item format and remove any remaining marker strings ("§~§")
    $contents = preg_replace($regex_list_cleanup_nopipe, $replace_list_cleanup_nopipe, $contents);
    $contents = preg_replace($regex_list_cleanup_pipetext, $replace_list_cleanup_pipetext, $contents);
    $contents = preg_replace($regex_cleanup_final, $replace_cleanup_final, $contents);

    //========== Postprocessing
    $editsummary = substr($editsummary, 0, -2); // to get rid of superfluous comma and space
    overridecheck(); // checks if the manual override has been triggered

    // Record the page as handled. Titles can contain quote characters, so the
    // value is escaped before being placed in the statement.
    query("INSERT INTO `unlinked` (`name`) VALUES (\"" . mysql_real_escape_string($link) . "\")");

    // preg_replace() returns NULL when a pattern fails at run time, which would
    // blank the page if it were saved; never post an empty text.
    if ($contents === null || strlen($contents) == 0) {
        echo "Contents blanked during processing of article \"$link\". Skipping save step.";
    }
    else if ($contents !== $contents_archive) {
        $return_code = $objwiki->edit($link,$contents,$editsummary,true,true,null,true); // posts the change. The two "true" parameters indicate that this is a bot edit and it is a minor edit
        if (isset($return_code['error']['code']) && $return_code['error']['code'] == 'editconflict') {
            echo 'Edit conflict detected....';
        }
        sleep(10); // pause after each edit that was posted
    }
}
// ---- Driver ------------------------------------------------------------------
// For every month, walk the main-namespace backlinks of the month article itself
// and then of each "Month D" article (D = 1..31). Every backlinking page is
// screened by checktoprocess() and, if accepted, rewritten by unlinker().
$months = array("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December");
foreach ($months as $month) { // for each month
    $links = $objwiki->whatlinkshere($month, "&blnamespace=0");
    for ($j = 0; $j < count($links); $j++) {
        echo "Checking " . $links[$j] . "\n";
        if (checktoprocess($links[$j])) { // if the checktoprocess function returns true
            echo $links[$j] . " shall be processed.\n";
            unlinker($links[$j]);
        }
        else {
            echo $links[$j] . " shall NOT be processed.\n";
        }
    }
    for ($d = 1; $d < 32; $d++) { // This is like the above, except with different date combinations
        // The progress line used to read an undefined $month variable and so
        // printed no month name; it now names the page actually being queried.
        echo "Checking backlinks to " . $month . " " . $d . "\n";
        $links = $objwiki->whatlinkshere($month . " " . $d, "&blnamespace=0");
        for ($j = 0; $j < count($links); $j++) {
            echo "Checking " . $links[$j] . "\n";
            if (checktoprocess($links[$j])) {
                echo $links[$j] . " shall be processed.\n";
                unlinker($links[$j]);
            }
            else {
                echo $links[$j] . " shall NOT be processed.\n";
            }
        }
    }
}
?>