User:IPLRecordsUpdateBot/Version 1.0 source/CricinfoDataParser.php
Appearance
<?php
# Parser for ESPNCricinfo stats to be used by IPLRecordsUpdateBot
/**
 * Parser for ESPNCricinfo statistics tables, used by IPLRecordsUpdateBot.
 *
 * Typical use: call load() with a URI, check $loadFailed, then call parse()
 * to turn one of the page's <table class="engineTable"> elements into an
 * array of associative rows.
 */
class CricinfoDataParser {
    # DOMDocument of the most recently loaded page, or null if nothing is loaded
    private $pageDOM;
    # True when the most recent call to load() failed
    public $loadFailed = false;

    /**
     * Loads the data from the given URI.
     *
     * On failure $loadFailed is set to true, an E_USER_WARNING is triggered and
     * no document is kept, so a later parse() returns an empty array.
     *
     * @param string $uri the location passed to file_get_contents()
     * @return void
     */
    public function load($uri) {
        # Reset the state first so that a reused parser never reports a stale
        # failure flag or parses the document of an earlier, successful load
        $this->loadFailed = false;
        $this->pageDOM = null;

        $html = file_get_contents($uri);
        sleep(15); # Per robots.txt

        # Set loadFailed to true if an error occurred. An empty body is treated as
        # a failure too, because DOMDocument::loadHTML() rejects an empty string.
        if ( $html === false || $html === '' ) {
            $this->loadFailed = true;
            trigger_error("CricinfoDataParser: Unable to load data (URI: {$uri})", E_USER_WARNING);
            return;
        }

        $dom = new DOMDocument();
        # Collect libxml's markup warnings internally instead of silencing them with @
        $previousSetting = libxml_use_internal_errors(true);
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);
        # NOTE(review): kept from the original, where it is also set after parsing;
        # it probably has no effect on the already-loaded document - confirm.
        $dom->preserveWhiteSpace = false;
        $this->pageDOM = $dom;
    }

    const SORT_ASCENDING = 0,
          SORT_DESCENDING = 1;

    /**
     * Parses one "engineTable" of the loaded page into an array of rows.
     *
     * Every row is an associative array keyed by the names in $parseOrder, plus
     * "<name>:href" entries when $parseLinks is true and the cell contains a link,
     * plus a 'NOTE' entry (text of the note row at the same index, or null).
     * A cell that does not exist in a row yields null for that column.
     *
     * @param int           $tableIndex  the position of the table which is to be parsed in the HTML (0 is the first)
     * @param array         $parseOrder  the names of each column in the result (index 0 is the first column)
     * @param bool          $parseLinks  whether the href attributes of links should be parsed
     * @param int|null      $limit       the maximum number of rows to return in the result, null for no limit
     * @param array|null    $sortOrder   the column names to sort by, in priority order; no sorting if this is not an array
     * @param int           $sortMode    CricinfoDataParser::SORT_ASCENDING or CricinfoDataParser::SORT_DESCENDING
     * @param array         $sortReverse column names to be sorted in reverse order
     * @param callable|null $filter      callback taking the row data and returning true (keeps the row) or false (deletes it)
     * @return array the parsed rows; empty when no document is loaded or the table does not exist
     */
    public function parse($tableIndex, $parseOrder, $parseLinks = false, $limit = null, $sortOrder = null,
        $sortMode = CricinfoDataParser::SORT_DESCENDING, $sortReverse = [], $filter = null) {
        # Nothing to parse if load() was never called or failed
        if ( !($this->pageDOM instanceof DOMDocument) ) {
            return [];
        }

        $xpath = new DOMXPath($this->pageDOM);

        # Select all the rows from the table with class="engineTable" at $tableIndex
        $table = $xpath->query("//table[@class = 'engineTable']")->item($tableIndex);
        if ( $table === null ) {
            return [];
        }
        $rows = $xpath->query("tbody/tr[starts-with(@class, 'data')]", $table);
        $notes = $xpath->query("tbody/tr[@class = 'note']", $table);

        # Parse the result
        $result = [];
        for ( $i = 0; $i < $rows->length; $i++ ) {
            $cells = $rows->item($i)->getElementsByTagName('td');
            $rowData = [];
            foreach ( $parseOrder as $pos => $name ) {
                $cell = $cells->item($pos);
                $rowData[$name] = ($cell !== null) ? $cell->nodeValue : null;

                # If $parseLinks is true, take the href attribute of the first link in the cell
                if ( $parseLinks && $cell !== null ) {
                    $links = $cell->getElementsByTagName('a');
                    if ( $links->length ) {
                        $rowData[$name . ':href'] = $links->item(0)->getAttribute('href');
                    }
                }
            }
            # Parse the associated row with class="note" (looked up once per row)
            $rowData['NOTE'] = self::noteText($notes, $i);
            $result[] = $rowData;
        }

        # If a callback function is passed to $filter, filter the array
        if ( is_callable($filter) ) {
            $result = array_values(array_filter($result, $filter));
        }

        # Sort the result if a sort order is given
        if ( is_array($sortOrder) ) {
            usort($result, function($data1, $data2) use ($sortOrder, $sortReverse) {
                return self::compareRows($data1, $data2, $sortOrder, $sortReverse);
            });
            # The comparator sorts ascending; descending output is the reversed list
            if ( $sortMode == CricinfoDataParser::SORT_DESCENDING ) {
                $result = array_reverse($result);
            }
        }

        # Limit the result to $limit and return it
        return array_slice($result, 0, $limit);
    }

    # Returns the text of the first cell of the note row at $index, or null when
    # there is no such note row or it has no cell.
    private static function noteText($notes, $index) {
        $noteRow = $notes->item($index);
        if ( $noteRow === null ) {
            return null;
        }
        $cell = $noteRow->getElementsByTagName('td')->item(0);
        return ($cell !== null) ? $cell->nodeValue : null;
    }

    # Compares two parsed rows column by column in the order given by $sortOrder.
    # Returns -1, 0 or 1; columns listed in $sortReverse have their result inverted.
    private static function compareRows($data1, $data2, $sortOrder, $sortReverse) {
        foreach ( $sortOrder as $name ) {
            $direction = in_array($name, $sortReverse, true) ? -1 : 1;
            $value1 = isset($data1[$name]) ? $data1[$name] : null;
            $value2 = isset($data2[$name]) ? $data2[$name] : null;

            $sort = 0;
            if ( $name == 'C_NAME' ) { # Names are compared through the translation table
                $uri1 = isset($data1[$name . ':href']) ? $data1[$name . ':href'] : null;
                $uri2 = isset($data2[$name . ':href']) ? $data2[$name . ':href'] : null;
                $sort = self::compareNames($value1, $value2, $uri1, $uri2);
            } elseif ( $name == 'C_SCORE' || $name == 'C_BEST_BOWLING' ) { # Scores or bowling figures
                $sort = self::compareFigures($value1, $value2, $name == 'C_SCORE');
            } elseif ( $name == 'C_DATE' ) { # Dates are compared by their Unix timestamps
                $time1 = strtotime((string) $value1);
                $time2 = strtotime((string) $value2);
                $sort = ($time1 == $time2) ? 0 : (($time1 > $time2) ? 1 : -1);
            }
            if ( $sort != 0 ) {
                return $sort * $direction;
            }

            # A tie in a specialised comparison deliberately falls through to the plain
            # value comparison, e.g. players missing from the translation table all share
            # one sort key and are then ordered by their displayed name.
            if ( $value1 != $value2 ) {
                $sort = self::compareValues($value1, $value2);
                if ( $sort != 0 ) {
                    return $sort * $direction;
                }
            }
        }
        return 0;
    }

    # Compares two player names using $GLOBALS['CricinfoPlayerNameTranslationTable'].
    # When both rows carry a profile URI, the player ID from the URI is used as the
    # table key if the table has an entry for it. The sort key of an entry is its
    # 'sort' value, or "<last>, <first>" when 'sort' is missing or empty.
    private static function compareNames($name1, $name2, $uri1, $uri2) {
        $table = [];
        if ( isset($GLOBALS['CricinfoPlayerNameTranslationTable']) && is_array($GLOBALS['CricinfoPlayerNameTranslationTable']) ) {
            $table = $GLOBALS['CricinfoPlayerNameTranslationTable'];
        }

        # Set the names to the IDs if they are present in the table
        if ( $uri1 && $uri2 ) {
            $name1 = self::resolvePlayerKey($name1, $uri1, $table);
            $name2 = self::resolvePlayerKey($name2, $uri2, $table);
        }

        $order = strcmp(self::nameSortKey($table, $name1), self::nameSortKey($table, $name2));
        return ($order > 0) ? 1 : (($order < 0) ? -1 : 0);
    }

    # Returns the player ID found in $uri when the table has an entry for it, otherwise $name.
    private static function resolvePlayerKey($name, $uri, $table) {
        if ( preg_match('%/content(?:/.*?)?/player/(\d+)\.html$%', (string) $uri, $idMatch) && isset($table[$idMatch[1]]) ) {
            return $idMatch[1];
        }
        return $name;
    }

    # Builds the sort key for the table entry stored under $key (", " when there is no entry).
    private static function nameSortKey($table, $key) {
        $entry = [];
        if ( (is_string($key) || is_int($key)) && isset($table[$key]) && is_array($table[$key]) ) {
            $entry = $table[$key];
        }
        if ( !empty($entry['sort']) ) {
            return (string) $entry['sort'];
        }
        $last = isset($entry['last']) ? $entry['last'] : '';
        $first = isset($entry['first']) ? $entry['first'] : '';
        return $last . ', ' . $first;
    }

    # Compares two "A/B" figures (innings scores or bowling figures).
    # The part before the slash decides first (higher is greater); on a tie the
    # part after the slash decides, where the LOWER value is greater:
    # if two bowlers took the same number of wickets, the one with LESS runs is better,
    # and if two teams scored the same runs, the one with LESS wickets is better.
    # Both sides are converted to integers so the comparison is symmetric.
    private static function compareFigures($figure1, $figure2, $implicit10 = false) {
        $parts1 = explode('/', (string) $figure1);
        $parts2 = explode('/', (string) $figure2);
        # In innings scores, if only the runs are mentioned it implies 10 wickets
        $missing = $implicit10 ? 10 : 0;

        $primary1 = (int) $parts1[0];
        $primary2 = (int) $parts2[0];
        if ( $primary1 !== $primary2 ) {
            return ($primary1 > $primary2) ? 1 : -1;
        }

        $secondary1 = isset($parts1[1]) ? (int) $parts1[1] : $missing;
        $secondary2 = isset($parts2[1]) ? (int) $parts2[1] : $missing;
        if ( $secondary1 !== $secondary2 ) {
            return ($secondary1 < $secondary2) ? 1 : -1;
        }
        return 0;
    }

    # Compares two plain cell values. When BOTH start with a digit they are compared
    # as numbers; otherwise (or when the numbers are equal) as byte strings. Deciding
    # on both operands keeps the comparison symmetric.
    private static function compareValues($value1, $value2) {
        $text1 = (string) $value1;
        $text2 = (string) $value2;
        if ( preg_match('/^\d/', $text1) && preg_match('/^\d/', $text2) ) {
            $number1 = (float) $text1;
            $number2 = (float) $text2;
            if ( $number1 != $number2 ) {
                return ($number1 > $number2) ? 1 : -1;
            }
        }
        $order = strcmp($text1, $text2);
        return ($order > 0) ? 1 : (($order < 0) ? -1 : 0);
    }
}
?>