Jump to content

User:Nn123645/christian list parse.php

From Wikipedia, the free encyclopedia
<?php
// PHP settings: this is a long-running batch job, so lift the execution time limit
// and raise the memory ceiling (SimpleXML loads the whole dump into memory below).
set_time_limit( 0 );
ini_set( 'memory_limit', '500M' ); // 500 MB of RAM so we don't run out

// cats.txt is pulled in with require, so it is executed as PHP; it is expected to
// define the category map used by the main loop.
// NOTE(review): the main loop reads $categories, but the generator kept in the
// comment block at the bottom of this file writes the array as $categorys --
// confirm which name cats.txt actually defines.
require 'cats.txt';
require 'privateconfig.php'; // we define the mysql variables here

// Open the database connection, then drop every connection setting from the
// global scope (the original listed $mysql_server twice and never cleared
// $mysql_dbname).
$mysql = new mysqli( $mysql_server, $mysql_user, $mysql_pass, $mysql_dbname );
unset( $mysql_server, $mysql_user, $mysql_pass, $mysql_dbname );

if ( $mysql->connect_error ) {
	// connect_error is a property; calling it as a method is a fatal error that
	// would hide the real connection failure.
	die( 'Connect Error (' . $mysql->connect_errno . ') ' . $mysql->connect_error );
}

// Parse the exported dump; every <page> element is visited by the main loop.
$dump = simplexml_load_file( 'Wikipedia-20090330211613.xml' ) or die( 'problem' );

/**
 * Abort the script, reporting the error of the last failed MySQL statement.
 *
 * @param mysqli $mysql Connection on which the statement failed.
 */
function clp_sql_die( mysqli $mysql ) {
	die( '(' . $mysql->errno . ') ' . $mysql->error );
}

/**
 * Reduce the target of a category link to the bare category name.
 *
 * Drops an optional "|sort key" suffix and surrounding whitespace, so that
 * "Foo|Smith, John" and " Foo " both become "Foo".
 *
 * @param string $link_target Text captured between "Category:" and "]]".
 * @return string Category name without sort key.
 */
function clp_category_name( $link_target ) {
	$parts = explode( '|', (string) $link_target, 2 );
	return trim( $parts[0] );
}

/**
 * Look up the page_id stored in the `pages` table for a page title.
 *
 * @param mysqli $mysql Open database connection.
 * @param string $title Page title as found in the dump.
 * @return string|null The page_id, or null when the title is not in `pages`.
 */
function clp_find_page_id( mysqli $mysql, $title ) {
	$result = $mysql->query(
		"SELECT page_id FROM pages WHERE page_name = '" . $mysql->real_escape_string( $title ) . "' LIMIT 1;"
	);
	if ( !$result ) {
		clp_sql_die( $mysql );
	}
	$row = $result->fetch_row();
	$result->close();

	return is_array( $row ) ? $row[0] : null;
}

/**
 * Insert a (list_match, page_id) row into `matches` unless it already exists.
 *
 * @param mysqli     $mysql      Open database connection.
 * @param int|string $list_match List key to store (century number or other key from the category map).
 * @param int|string $page_id    Id of the page in the `pages` table.
 * @param string     $label      Tag echoed after "added " when a row is inserted.
 * @return bool True when a row was inserted, false when it was already present.
 */
function clp_add_match_once( mysqli $mysql, $list_match, $page_id, $label ) {
	$list_sql = $mysql->real_escape_string( (string) $list_match );
	$page_sql = $mysql->real_escape_string( (string) $page_id );

	// The existence check and the insert use the very same list_match value;
	// the original checked for '200' but inserted '21', so reruns duplicated rows.
	$result = $mysql->query(
		"SELECT match_id FROM matches WHERE page_id = '$page_sql' AND list_match = '$list_sql' LIMIT 1;"
	);
	if ( !$result ) {
		clp_sql_die( $mysql );
	}
	$already_there = $result->num_rows > 0;
	$result->close();

	if ( $already_there ) {
		return false;
	}

	if ( !$mysql->query( "INSERT INTO matches (list_match, page_id) VALUES ('$list_sql', '$page_sql' );" ) ) {
		clp_sql_die( $mysql );
	}
	echo "added $label\n";

	return true;
}

// {{Birthdeath|N...}} / {{Lifetime|N...}}: capture the leading digits of the first parameter.
// NOTE(review): [\d]{1,3} is greedy, so a four-digit year such as 1950 is captured as 195
// and falls outside the 2..19 range handled below -- confirm this is the intended mapping.
$template_pattern = '%\{\{(?i:Birthdeath|Lifetime)\|([\d]{1,3})(?:[^\}]*)\}\}%';
$category_pattern = '%\[\[Category:([^\]]+)\]\]%';

// The generator kept at the bottom of this file writes the map as $categorys;
// accept that spelling when it is the only one defined.
if ( !isset( $categories ) && isset( $categorys ) ) {
	$categories = $categorys;
}

// Flatten the map once into ( list key, normalised category name ) pairs so the
// per-page work is a single pass over this list.
$category_index = array();
if ( isset( $categories ) && is_array( $categories ) ) {
	foreach ( $categories as $century => $category_ar ) {
		foreach ( (array) $category_ar as $category_name ) {
			$category_index[] = array( $century, clp_category_name( $category_name ) );
		}
	}
} else {
	error_log( 'christian list parse: no category map defined, only template matches will be recorded' );
}

foreach ( $dump->page as $page ) {
	$title = (string) $page->title;
	$text  = (string) $page->revision->text[0];

	// Matches found on this page, in insertion order: array( list key, echo label ).
	$pending = array();

	$has_template = preg_match_all( $template_pattern, $text, $template_hits ) > 0;
	if ( $has_template ) {
		foreach ( $template_hits[1] as $captured ) {
			$number = (int) $captured;
			if ( $number >= 2 && $number <= 19 ) {
				// stored one higher than the captured number, as in the original
				$pending[] = array( $number + 1, 'query1' );
			} elseif ( $number == 200 ) {
				// captured 200 is stored as list 21
				$pending[] = array( 21, 'query2' );
			}
		}
	}

	// Categories are scanned whether or not a template matched; only the echoed
	// label differs (query3 with a template, query4 without), as in the original.
	if ( preg_match_all( $category_pattern, $text, $category_hits ) ) {
		$label = $has_template ? 'query3' : 'query4';
		foreach ( $category_hits[1] as $link_target ) {
			// Strip "|sort key" so [[Category:Foo|Smith, John]] still matches "Foo".
			$found_name = clp_category_name( $link_target );
			foreach ( $category_index as $entry ) {
				if ( strcasecmp( $found_name, $entry[1] ) == 0 ) {
					$pending[] = array( $entry[0], $label );
				}
			}
		}
	}

	if ( count( $pending ) == 0 ) {
		continue;
	}

	// One lookup per page instead of one per match.
	$page_id = clp_find_page_id( $mysql, $title );
	if ( $page_id === null ) {
		// Not in `pages`: skip rather than write rows with an empty page_id.
		error_log( 'christian list parse: page "' . $title . '" is not in the pages table, skipped' );
		continue;
	}

	foreach ( $pending as $entry ) {
		clp_add_match_once( $mysql, $entry[0], $page_id, $entry[1] );
	}
}



// Testing Section

/*
$revision = 'Hi my name is foo
I like to say foobar
foo is good
[[Category:Foobar]]
[[Category:Current national leaders]]
[[Category:14th-century Christian clergy]]';*/

// This portion was used for parsing the lists into something we can use
 
//$page = file_get_contents( 'input.txt' ) or die ('problem');
//$page = file_get_contents( 'criterion.txt' ) or die ('problem'); 

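/* This was used to parse the categories on the list page and generate cats.txt.
   NOTE: it writes the array as $categorys, whereas the main loop above reads $categories.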
$page = explode( '===', $page );

$fp = fopen ( 'cats.txt', 'w' );
fwrite( $fp, "<?php\n\n" . '$categorys = array (' . "\n");

$i = 22;
foreach ( $page as $page ) {
	$matches = array();
	
	if( preg_match_all( '%\[\[:Category:([^\]]+)\]\]%', $page, $matches) ) {
		if( $i == 22  ) {
			fwrite( $fp, "\t'LP' => array(\n" );
			--$i;
		} else {
			fwrite( $fp, "\t'$i' => array(\n" );
			--$i;
		}
	
		foreach ( $matches[1] as $match ) {
			$match = addslashes($match);
			fwrite($fp, "\t\t'$match',\n");
		}

		fwrite( $fp, "\t),\n");
	}
}
fwrite( $fp, ");\n\n?>");
fclose( $fp );
*/

/* This was to insert the list of pages into the database 
foreach ( $match as $match ) {
	//$mysql->query('INSERT INTO pages ( page_name ) VALUES ( "' . $mysql->real_escape_string( $match ) . '");');
}
*/

?>