Jump to content

User:IPLRecordsUpdateBot/Version 1.0 source/CricinfoDataParser.php

From Wikipedia, the free encyclopedia
<?php

# Parser for ESPNCricinfo stats to be used by IPLRecordsUpdateBot

class CricinfoDataParser {

    # Values accepted by the $sortMode parameter of parse()
    const SORT_ASCENDING = 0,
          SORT_DESCENDING = 1;

    # DOMDocument built from the most recently loaded page (null until load() succeeds)
    private $pageDOM;

    # Set to true by load() when a page cannot be fetched. It is never reset by this class.
    public $loadFailed = false;

    /*
        Loads the data from the given URI and parses it into a DOM document.

        - uri: anything file_get_contents() can read.

        Always sleeps 15 seconds after the fetch attempt (successful or not).
        On failure, sets $loadFailed to true, raises an E_USER_WARNING and leaves
        any previously loaded document in place.
    */
    public function load($uri) {
        $html = file_get_contents($uri);
        sleep(15);  # Per robots.txt

        # Set loadFailed to true if an error occurred
        if ( $html === false ) {
            $this->loadFailed = true;
            trigger_error("CricinfoDataParser: Unable to load data (URI: {$uri})", E_USER_WARNING);
            return;
        }

        $this->pageDOM = new DOMDocument();
        @($this->pageDOM->loadHTML($html));  # Markup warnings from the HTML parser are suppressed
        $this->pageDOM->preserveWhiteSpace = false;

    }

    /*
        Extracts the rows of one "engineTable" table from the loaded page as an array of
        associative arrays, optionally filtered, sorted and truncated (in that order).

        DESCRIPTION OF PARAMETERS
        - tableIndex: the position of the table which is to be parsed in the HTML (0 is the first)
        - parseOrder: an array containing the names of each column in the result. (index 0 is the first column)
        - parseLinks: whether the href attributes of links should be parsed.
                      The href of the first link in a cell is stored under the key "<column name>:href".
        - limit: the maximum number of rows to return in the result, null for no limit.
        - sortOrder: the order in which the result is to be sorted. No sorting if this is not an array.
        - sortMode: CricinfoDataParser::SORT_ASCENDING for ascending sort or CricinfoDataParser::SORT_DESCENDING for descending sort
        - sortReverse: Array containing column names to be sorted in reverse order
        - filter: callback function to remove rows which do not fulfill a certain condition.
                  should take one parameter (the row data) and return true (which keeps the row) or false (which deletes it)

        RETURN VALUE
        A list of rows. Each row maps the names in parseOrder to the text of the matching cell
        (null if the row has no cell at that position) and 'NOTE' to the text of the note row
        with the same index (null if there is none). An empty array is returned if the table
        does not exist.

        Throws a LogicException if no page has been loaded successfully.
    */
    public function parse($tableIndex, $parseOrder, $parseLinks = false, $limit = null, $sortOrder = null,
                          $sortMode = CricinfoDataParser::SORT_DESCENDING, $sortReverse = [], $filter = null) {

        # Fail with a clear message instead of a type error from the DOMXPath constructor
        if ( !($this->pageDOM instanceof DOMDocument) ) {
            throw new LogicException('CricinfoDataParser: parse() called before a page was successfully loaded');
        }

        $xpath = new DOMXPath($this->pageDOM);

        # Select all the rows from the table with class="engineTable" at $tableIndex
        $tableList = $xpath->query("//table[@class = 'engineTable']");
        $table = $tableList->item($tableIndex);
        if ( $table === null ) {
            return [];  # No such table, so there is nothing to parse
        }
        $rows = $xpath->query("tbody/tr[starts-with(@class, 'data')]", $table);
        $notes = $xpath->query("tbody/tr[@class = 'note']", $table);

        $result = [];

        # Parse the result
        for ( $i = 0; $i < $rows->length; $i++ ) {

            $cells = $rows->item($i)->getElementsByTagName('td');

            # Parse the associated row with class="note".
            # NOTE(review): notes are paired with data rows purely by index; this presumably
            # relies on every data row having a note whenever any row has one - confirm.
            # The table may contain fewer notes than rows, so every lookup is checked for null.
            $noteRow = $notes->item($i);
            $noteCell = ($noteRow !== null) ? $noteRow->getElementsByTagName('td')->item(0) : null;
            $note = ($noteCell !== null) ? $noteCell->nodeValue : null;

            $rowData = [];

            foreach ( $parseOrder as $pos => $name ) {
                # A row may have fewer cells than $parseOrder has columns
                $cell = $cells->item($pos);
                $rowData[$name] = ($cell !== null) ? $cell->nodeValue : null;

                # If $parseLinks is true, take the href attribute of the first link in the cell
                if ( $parseLinks && $cell !== null ) {
                    $links = $cell->getElementsByTagName('a');
                    if ( $links->length ) {
                        $rowData[$name . ':href'] = $links->item(0)->getAttribute('href');
                    }
                }

                # Assigned inside the loop so that the 'NOTE' key keeps its historical position
                # in the row (directly after the first column) and is absent when $parseOrder is empty
                $rowData['NOTE'] = $note;
            }

            $result[] = $rowData;

        }

        # If a callback function is passed to $filter, filter the array
        if ( is_callable($filter) ) {
            $result = array_values(array_filter($result, $filter));
        }

        # Sort the result if a sort order is given
        if ( is_array($sortOrder) ) {
            usort(  $result,
                    function($data1, $data2) use ($sortOrder, $sortReverse) {
                        return self::compareRows($data1, $data2, $sortOrder, $sortReverse);
                    }
            );

            # The comparison is ascending; a descending sort is the reverse of that
            if ( $sortMode == self::SORT_DESCENDING ) {
                $result = array_reverse($result);
            }
        }

        # Limit the result to $limit and return the final result
        return array_slice($result, 0, $limit);
    }

    /*
        usort() comparison function for two parsed rows.

        Walks through $sortOrder and returns at the first column which tells the rows apart.
        Columns named C_NAME, C_SCORE, C_BEST_BOWLING and C_DATE get a specialised comparison
        first; if that comparison ties, the generic comparison below is still applied to the
        raw cell text (this is what orders names missing from the translation table).
        The result is negated for columns listed in $sortReverse.
    */
    private static function compareRows($data1, $data2, $sortOrder, $sortReverse) {

        foreach ( $sortOrder as $name ) {
            $direction = in_array($name, $sortReverse, true) ? -1 : 1;
            $sort = 0;

            if ( $name === 'C_NAME' ) {  # Compare by sort key if the arguments are names
                $uri1 = isset($data1[$name . ':href']) ? $data1[$name . ':href'] : null;
                $uri2 = isset($data2[$name . ':href']) ? $data2[$name . ':href'] : null;
                $sort = self::compareNames($data1[$name], $data2[$name], $uri1, $uri2);
            }
            elseif ( $name === 'C_SCORE' || $name === 'C_BEST_BOWLING' ) {  # Scores or bowling figures
                $sort = self::compareScoresAndBowlingFigures($data1[$name], $data2[$name], $name === 'C_SCORE');
            }
            elseif ( $name === 'C_DATE' ) {  # If the arguments are dates, sort by their Unix timestamps
                $time1 = strtotime($data1[$name]);
                $time2 = strtotime($data2[$name]);
                $sort = ($time1 == $time2) ? 0 : (($time1 > $time2) ? 1 : -1);
            }

            if ( $sort != 0 ) {
                return $sort * $direction;
            }

            # Otherwise use numerical comparison (or string comparison if the second argument is not a number)
            if ( $data1[$name] != $data2[$name] ) {
                $other = preg_match('/^\d+?/', (string) $data2[$name]) ? ((float) $data2[$name]) : $data2[$name];
                return (($data1[$name] > $other) ? 1 : -1) * $direction;
            }
        }

        return 0;
    }

    /*
        Compares two player names by the sort keys found in the global
        $CricinfoPlayerNameTranslationTable (treated as empty if it is not defined).

        If both rows carry a player link, the numeric player ID from the link is used as the
        table key whenever the table has an entry for it; otherwise the name itself is the key.
        Returns -1, 0 or 1.
    */
    private static function compareNames($arg1, $arg2, $uri1, $uri2) {
        $table = isset($GLOBALS['CricinfoPlayerNameTranslationTable'])
               ? $GLOBALS['CricinfoPlayerNameTranslationTable']
               : [];

        # Set the args to the IDs if they are present in the table
        if ( $uri1 && $uri2 ) {
            $arg1 = self::playerKey($table, $arg1, $uri1);
            $arg2 = self::playerKey($table, $arg2, $uri2);
        }

        $sortKey1 = self::nameSortKey($table, $arg1);
        $sortKey2 = self::nameSortKey($table, $arg2);
        return ($sortKey1 == $sortKey2) ? 0 : (($sortKey1 > $sortKey2) ? 1 : -1);
    }

    # Returns the player ID contained in $uri if the table has an entry for it, otherwise $default
    private static function playerKey($table, $default, $uri) {
        if ( preg_match('%/content(?:/.*?)?/player/(\d+)\.html$%', $uri, $idmatch) && isset($table[$idmatch[1]]) ) {
            return $idmatch[1];
        }
        return $default;
    }

    # Returns the 'sort' value of the table entry for $key, or "last, first" if it has none.
    # A key without a table entry yields ", ".
    private static function nameSortKey($table, $key) {
        $entry = ( isset($table[$key]) && is_array($table[$key]) ) ? $table[$key] : [];

        if ( !empty($entry['sort']) ) {
            return $entry['sort'];
        }

        $last = isset($entry['last']) ? $entry['last'] : '';
        $first = isset($entry['first']) ? $entry['first'] : '';
        return $last . ', ' . $first;
    }

    /*
        Compares two "X/Y" figures (innings scores or bowling figures). Returns -1, 0 or 1.

        The part before the slash is compared first (higher is greater); if it ties, the
        figure with the LOWER second part is greater. A missing second part counts as 10
        when $implict10 is true and as 0 otherwise. Both parts are compared as integers
        so that the comparison gives the same answer whichever way round it is asked.
    */
    private static function compareScoresAndBowlingFigures($arg1, $arg2, $implict10 = false) {
        $parts1 = explode('/', (string) $arg1);
        $parts2 = explode('/', (string) $arg2);

        # In innings scores, if only the runs are mentioned it implies 10 wickets
        $missing = $implict10 ? 10 : 0;
        $first1 = (int) $parts1[0];
        $first2 = (int) $parts2[0];
        $second1 = isset($parts1[1]) ? (int) $parts1[1] : $missing;
        $second2 = isset($parts2[1]) ? (int) $parts2[1] : $missing;

        if ( $first1 !== $first2 ) {
            return ($first1 > $first2) ? 1 : -1;
        }
        if ( $second1 !== $second2 ) {
            return ($second1 < $second2) ? 1 : -1;  # If two bowlers took the same number of wickets, the one with LESS runs is better
                                                    # Similarly if two teams scored the same runs, the one with LESS wickets is better
        }
        return 0;
    }

}

?>