Jump to content

User:SQL/ListGen.php

From Wikipedia, the free encyclopedia
<?php
/*
MIT License

Copyright (c) 2018 SQL at the English Wikipedia ( https://en.wikipedia.org/wiki/User:SQL )

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Requirements:

* 150 Mainspace edits
** First pass filter to narrow down query: user_editcount > 149
*** Second pass via independant query?
**** Screw that, way to slow. We'll use the API.
* Registered by: October 28th, 2018
** Easy. user_registration > 20181028000000
*** Done
* Not Blocked
** Probably fastest to do api queries? Maybe tie into generator query
*** Yes, Done, but via subquery
* At least one edit this year (Not for eligibility, but for mass messaging)
** Probably best to do on it's own query
*** Select rev_id from revision_userindex where rev_user = 123 and rev_timestamp > 20171028000000 LIMIT 1;
**** Nope, done via subquery.

Generator with block checking: select count(user_name) from user where user_editcount > 149 and user_registration < 20181028000000 and user_id not in (select ipb_user from ipblocks where ipb_user = user_id);
At present, returns 163,947 users.

+------------------+
| count(user_name) |
+------------------+
|           163947 |
+------------------+
1 row in set (16.19 sec)

Without block filtering, returns 181,301 users.

+------------------+
| count(user_name) |
+------------------+
|           181301 |
+------------------+
1 row in set (43.01 sec)

10/11/18 - Possible TODO: Pull lists from [[Category:All Wikipedia bots]], [[Category:Wikipedia alternative accounts]], and [[Category:Wikipedia doppelganger accounts]] . Discard these accounts for notifications. This would net about -6558 accounts - probably a bit less, as there is likely some overlap between those cats.
* Done!
*/

/* Config */

$ml = 2; // Maxlag parameter
$ua = "SQLBot performing ace2018 analysis. See [[User:SQL]] on enwiki"; // User-Agent
$catquery = "select
  distinct page_title
from
  categorylinks
  join page on cl_from = page_id
where
  (
    cl_to = 'All_Wikipedia_bots'
    or cl_to = 'Wikipedia_alternative_accounts'
    or cl_to = 'Wikipedia_doppelganger_accounts'
  )
  and (
    page_namespace = 2
    or page_namespace = 3
  )
  and page_title not like '%/%';";
$init = "select
  user_name,
  user_id,
  user_editcount,
  user_registration,
  (
    select
      rev_timestamp
    from
      revision_userindex
    where
      rev_user = user_id
    order by
      rev_timestamp desc
    limit
      1
  ) as lastedit
from
  user
where
  user_editcount > 149
  and user_registration < 20181028000000;"; // Generator query

$ts_pw = posix_getpwuid(posix_getuid());
$ts_mycnf = parse_ini_file($ts_pw['dir'] . "/replica.my.cnf");

$mysqli = mysqli_connect( 'enwiki.analytics.db.svc.eqiad.wmflabs', $ts_mycnf['user'], $ts_mycnf['password'], 'enwiki_p' );

$userresult = mysqli_query( $mysqli, $catquery );

$dops = array();
while( $catrow = mysqli_fetch_array( $userresult, MYSQLI_ASSOC ) ) {
	array_push( $dops, $catrow['page_title'] );
}

$result = mysqli_query( $mysqli, $init );
$eligible_users = "";
$ineligible_users = "";
$message_users = "";
$rows = mysqli_num_rows( $result );
$current = 1;
$e = 0;
$i = 0;
$b = 0;
$m = 0;
$d = 0;
$dop = FALSE;
$discardeddops = "";
while( $row = mysqli_fetch_array( $result, MYSQLI_ASSOC ) ) {
	$status = "";
	$message = 0;
	$eligible = 0;
	$username = $row['user_name'];
	$userid = $row['user_id'];
	$userreg = $row['user_registration'];
	$busername = urlencode($username);
	$lastedit = $row['lastedit'];
	$dopuser = str_replace( " ", "_", $username );
	if( array_search( $dopuser, $dops ) === FALSE ) { $dop = FALSE; } else { $dop = TRUE; $discardeddops .= "$username\n"; $status .= "d"; $d++; }
	if( strtotime( $lastedit ) > time() - 31536000 ) {
		$message = 1;
	}
	$options = array(
		'http'=>array(
		'method'=>"GET",
		'header'=>"Accept-language: en\r\n" .
			"User-Agent: $ua\r\n" 
		)
	);
	$context = stream_context_create($options);
	$url = "https://en.wikipedia.org/w/api.php?action=query&list=usercontribs&uclimit=165&ucdir=newer&ucnamespace=0&format=json&ucprop=size&maxlag=$ml&ucuserids=$userid";
	$contribs = json_decode( file_get_contents( $url, false, $context ), TRUE );
	$maxlag = TRUE;
	while( $maxlag == TRUE ) {
		if( @$contribs['error']['code'] == "maxlag" ) {
			$maxlag = TRUE;
			$lag = $contribs['error']['lag'];
			echo "Hit maxlag! Sleeping $lag seconds.\n";
			sleep( $lag );
			$contribs = json_decode( file_get_contents( $url, false, $context ), TRUE );
		} else {
			$maxlag = FALSE;
		}
	}
	$edits = count( $contribs['query']['usercontribs'] );
	if( $edits >= 150 ) { $eligible = 1; $status .= "e"; $eligible_users .= "$username\n"; $e++; } else { $status .= "i"; $ineligible_users .= "$username\n"; $i++; }
	if( $message == 1 && $eligible == 1 && $dop === FALSE ) {
		$apibase = file_get_contents( "https://en.wikipedia.org/w/api.php?action=query&list=blocks&bkprop=expiry&bkusers=$busername&format=json", false, $context );
		$apiblocks = json_decode($apibase , TRUE );
		@$bexp = $apiblocks['query']['blocks'][0]['expiry'];
			if( $bexp != "infinity" ) {
				$message_users .= "* [[User_talk:$username]]\n"; $status .= "m"; $m++; 
			} else {
				$status .= "b";
				$b++;
			}
	}
	$pct = number_format( round( ( $current / $rows ) * 100, 2 ), 2 ) . "%";
	echo "[$current/$rows/$pct][e:$e/m:$m/i:$i/d:$d/b:$b][$status]: $username UID: $userid - Reg: $userreg - Lastedit = $lastedit - NS0 Edits: $edits\n";
	$current++;
}
file_put_contents( "/data/project/aivanalysis/ace2018/message_users", $message_users );
file_put_contents( "/data/project/aivanalysis/ace2018/eligible_users", $eligible_users );
file_put_contents( "/data/project/aivanalysis/ace2018/ineligible_users", $ineligible_users );
file_put_contents( "/data/project/aivanalysis/ace2018/discardeddops", $discardeddops );
?>