User:Nn123645/christian list parse.php
Appearance
<?php
// PHP settings: this is a long-running batch job, so lift the usual limits.
set_time_limit(0); // no execution time limit
ini_set( 'memory_limit', '500M' ); // set memory_limit to 500 MB of RAM so we don't run out

// Require parsed category names.
// The page loop further down reads $categories, which is never assigned in
// this file, so it is expected to come from one of these includes
// (presumably cats.txt -- verify against the checked-in file).
require 'cats.txt';
require 'privateconfig.php'; // we define the mysql variables here

// New instance of MySQLi
$mysql = new mysqli ( $mysql_server, $mysql_user, $mysql_pass, $mysql_dbname );

// The connection settings are no longer needed once the connection object
// exists; drop all four of them (the original listed $mysql_server twice and
// left $mysql_dbname behind).
unset( $mysql_server, $mysql_user, $mysql_pass, $mysql_dbname );

if ( $mysql->connect_error ) {
	// connect_error is a property of mysqli, not a method; calling it as a
	// method would abort with an "undefined method" error instead of
	// reporting the real connection problem.
	die( 'Connect Error (' . $mysql->connect_errno . ') ' . $mysql->connect_error );
}

// Load the complete XML dump; every <page> element is visited by the loop below.
$dump = simplexml_load_file('Wikipedia-20090330211613.xml') or die('problem');
/**
 * Abort the script with the error reported by a mysqli or mysqli_stmt object.
 *
 * @param mysqli|mysqli_stmt $source Object whose errno/error describe the failure.
 */
function clp_sql_die( $source ) {
	die( 'problem, sql (' . $source->errno . ') ' . $source->error );
}

/**
 * Work out which list entries a page's wikitext qualifies for.
 *
 * Pure function: it only inspects its arguments and touches neither the
 * database nor the output.
 *
 * @param string $wikitext   Wikitext of the page revision.
 * @param array  $categories Map of list key => array of category names.
 * @return array List of array( list_match, label ) pairs in processing order.
 *               list_match is always a string; label is the suffix used in the
 *               "added ..." progress message (query1..query4).
 */
function clp_collect_matches( $wikitext, $categories ) {
	$pending = array();

	// Category hits are reported as "query3" when the page also carries a
	// Birthdeath/Lifetime template and as "query4" when it does not.
	$category_label = 'query4';

	// NOTE(review): the pattern captures at most the first three digits of the
	// first template parameter, so a four-digit value such as 1850 yields 185
	// and is ignored below -- confirm this is intended.
	$template_hits = array();
	if ( preg_match_all( '%\{\{(?i:Birthdeath|Lifetime)\|([\d]{1,3})(?:[^\}]*)\}\}%', $wikitext, $template_hits ) ) {
		$category_label = 'query3';
		foreach ( $template_hits[1] as $raw_value ) {
			$value = (int) $raw_value;
			if ( $value >= 2 && $value <= 19 ) {
				// Values 2..19 are stored as list entries 3..20.
				$pending[] = array( (string) ( $value + 1 ), 'query1' );
			} elseif ( $value == 200 ) {
				// The value 200 is stored as list entry 21.
				$pending[] = array( '21', 'query2' );
			}
		}
	}

	$category_hits = array();
	if ( preg_match_all( '%\[\[Category:([^\]]+)\]\]%', $wikitext, $category_hits ) ) {
		foreach ( $category_hits[1] as $found_category ) {
			foreach ( $categories as $list_key => $category_names ) {
				foreach ( $category_names as $category_name ) {
					if ( strcasecmp( $found_category, $category_name ) == 0 ) {
						// Numeric array keys come back as integers; normalise to string.
						$pending[] = array( (string) $list_key, $category_label );
					}
				}
			}
		}
	}

	return $pending;
}

/**
 * Look up the page_id of a page by its title.
 *
 * @param mysqli $mysql Open database connection.
 * @param string $title Page title as it appears in the dump.
 * @return mixed The page_id, or null when the title is not in the pages table.
 */
function clp_find_page_id( mysqli $mysql, $title ) {
	$stmt = $mysql->prepare( 'SELECT page_id FROM pages WHERE page_name = ? LIMIT 1' );
	if ( !$stmt ) {
		clp_sql_die( $mysql );
	}
	$page_id = null;
	if ( !$stmt->bind_param( 's', $title ) || !$stmt->execute() || !$stmt->bind_result( $page_id ) ) {
		clp_sql_die( $stmt );
	}
	// fetch() returns true for a row, null when there is none, false on error.
	$fetched = $stmt->fetch();
	if ( $fetched === false ) {
		clp_sql_die( $stmt );
	}
	$found_id = $fetched ? $page_id : null;
	$stmt->close();
	return $found_id;
}

/**
 * Insert a (list_match, page_id) row into matches unless it already exists.
 *
 * The existence check and the insert use the same list_match value, so
 * re-running the script does not create duplicate rows.
 *
 * @param mysqli $mysql      Open database connection.
 * @param mixed  $page_id    Id of the page in the pages table.
 * @param string $list_match List key to record for the page.
 * @param string $label      Suffix for the "added ..." progress message.
 * @return bool True when a row was inserted, false when it already existed.
 */
function clp_add_match_if_missing( mysqli $mysql, $page_id, $list_match, $label ) {
	// Bind both values as strings, mirroring the quoted literals the
	// hand-built queries used.
	$page_id = (string) $page_id;
	$list_match = (string) $list_match;

	$check = $mysql->prepare( 'SELECT match_id FROM matches WHERE page_id = ? AND list_match = ? LIMIT 1' );
	if ( !$check ) {
		clp_sql_die( $mysql );
	}
	if ( !$check->bind_param( 'ss', $page_id, $list_match ) || !$check->execute() || !$check->store_result() ) {
		clp_sql_die( $check );
	}
	$already_there = $check->num_rows > 0;
	$check->close(); // release the result set before the next statement

	if ( $already_there ) {
		return false;
	}

	$insert = $mysql->prepare( 'INSERT INTO matches (list_match, page_id) VALUES (?, ?)' );
	if ( !$insert ) {
		clp_sql_die( $mysql );
	}
	if ( !$insert->bind_param( 'ss', $list_match, $page_id ) || !$insert->execute() ) {
		clp_sql_die( $insert );
	}
	$insert->close();

	echo "added $label\n";
	return true;
}

// $categories is expected to be supplied by an included file. If it is
// missing, say so once and carry on with template matching only, rather than
// warning on every page.
$category_lists = array();
if ( isset( $categories ) && is_array( $categories ) ) {
	$category_lists = $categories;
} else {
	trigger_error( '$categories is not defined; category matching is disabled', E_USER_WARNING );
}

// Walk every page of the dump and record the list entries it belongs to.
foreach ( $dump->page as $page ) {
	$wikitext = (string) $page->revision->text[0];

	$pending = clp_collect_matches( $wikitext, $category_lists );
	if ( count( $pending ) == 0 ) {
		continue; // nothing on this page is of interest
	}

	// One lookup per page instead of one per match.
	$page_id = clp_find_page_id( $mysql, (string) $page->title );
	if ( $page_id === null ) {
		// The page is not in the pages table; recording a match for it would
		// store a row with an empty page_id.
		continue;
	}

	foreach ( $pending as $entry ) {
		clp_add_match_if_missing( $mysql, $page_id, $entry[0], $entry[1] );
	}
}
// Everything from here to the closing tag is commented out and does not run.
// It is kept as a record of the one-off steps used to prepare the inputs.

// Testing Section
// Sample wikitext containing three category links; presumably assigned to
// $revision by hand to try the category matching without loading the dump.
/*
$revision = 'Hi my name is foo
I like to say foobar
foo is good
[[Category:Foobar]]
[[Category:Current national leaders]]
[[Category:14th-century Christian clergy]]';*/
// This portion was used for parsing the lists into something we can use
// (two alternative input files; only one would be enabled at a time)
//$page = file_get_contents( 'input.txt' ) or die ('problem');
//$page = file_get_contents( 'criterion.txt' ) or die ('problem');
// Generator for cats.txt: splits the input on '===' section markers and, for
// each section containing [[:Category:...]] links, writes one array entry
// listing those category names. The first such section is keyed 'LP'; the
// following ones are keyed by a counter that counts down from 21.
// NOTE(review): this writes the array as $categorys, while the page loop in
// this file reads $categories -- confirm which name the cats.txt in use defines.
/* This was to parse the categories on the page
$page = explode( '===', $page );
$fp = fopen ( 'cats.txt', 'w' );
fwrite( $fp, "<?php\n\n" . '$categorys = array (' . "\n");
$i = 22;
foreach ( $page as $page ) {
$matches = array();
if( preg_match_all( '%\[\[:Category:([^\]]+)\]\]%', $page, $matches) ) {
if( $i == 22 ) {
fwrite( $fp, "\t'LP' => array(\n" );
--$i;
} else {
fwrite( $fp, "\t'$i' => array(\n" );
--$i;
}
foreach ( $matches[1] as $match ) {
$match = addslashes($match);
fwrite($fp, "\t\t'$match',\n");
}
fwrite( $fp, "\t),\n");
}
}
fwrite( $fp, ");\n\n?>");
fclose( $fp );
*/
// Loader for the pages table: one INSERT per page name. Note that the INSERT
// itself is additionally commented out inside the block.
/* This was to insert the list of pages into the database
foreach ( $match as $match ) {
//$mysql->query('INSERT INTO pages ( page_name ) VALUES ( "' . $mysql->real_escape_string( $match ) . '");');
}
*/
?>