User:Jediarchives11/Wikification

From Wikipedia, the free encyclopedia

The code below is for the Automatic Wikification Extension. When an article is saved, this extension searches its text for words or phrases that match the titles of existing articles and converts them into links to those articles.

<?php
/*
  Wikification Extension
  Gregory Szorc <gregory.szorc@case.edu>
  Requested by and edited by Nicholas Anderson <jediarchives11@gmail.com>
  
  This extension is a hook for MediaWiki that examines an article before it is
  committed to the database and looks for possible wiki topics in the article
  that are not marked as links and converts them.
  
  Changelog
    2005-07-25:  Work started
    2006-01-06:  Fixed Bug: When adding links, spaces would be removed
    2006-01-07:  $excludelist array added

  To Do
    *Fix bug: Commas and periods aren't removed when finding things to link
    *Fix bug: Last word in an article never links
*/

//Maximum number of words in a multi-word phrase (like "History of Greece")
//to consider when matching against existing page titles.
//The higher this number, the more candidate phrases are generated per save
//and the slower the extension runs.
$wikifiPhraseWordLimit = 4;

//Minimum number of characters a single word must have to be considered
//as a link candidate.
//This value is ignored for multi-word phrases.
$wikifiMinWordLength = 3;

//Namespaces whose page titles are searched for matches.
//Should have insignificant performance impact.
//NOTE(review): assumes MediaWiki defines NS_MAIN before this file loads.
$wikifiSearchNamespaces = array(NS_MAIN);


//When true, only capitalized words are considered as single-word link
//candidates (multi-word phrases are collected regardless of this setting).
$wikifiOnlyCheckProper = false;

//Register the setup function; MediaWiki calls it at extension load time.
$wgExtensionFunctions[] = "Wikification_Wikify";

//Extension setup function: attaches the wikification callback to the
//ArticleSave hook so Wikification_Save runs every time an article is saved.
function Wikification_Wikify() {
  $GLOBALS['wgHooks']['ArticleSave'][] = 'Wikification_Save';
}

//ArticleSave hook callback: scans the article wikitext for unlinked words
//and phrases that match existing page titles and converts them to links.
//
//$text is declared by reference so the modified wikitext is what actually
//gets saved (the file's original comment promised by-reference behavior,
//but the parameter was missing the '&').
//Always returns true so subsequent ArticleSave hooks still run.
function Wikification_Save($article, $user, &$text) {
  global $wikifiPhraseWordLimit, $wikifiMinWordLength, $wikifiSearchNamespaces;
  global $wikifiOnlyCheckProper;

  //grab the database handle
  //(plain assignment: '=&' on a function return is deprecated and unneeded)
  $db = wfGetDB(DB_MASTER);

  //first we need to strip out things that should never become links

  //strip out existing wiki links [[*]] and external links [*]
  $s = preg_replace("/\\[\\[.*?\\]\\]/", '', $text);
  $s = preg_replace("/\\[.*?\\]/", '', $s);

  //strip out section headers
  $s = preg_replace("/={1,5}.*?={1,5}/", '', $s);

  //strip punctuation that would otherwise stick to adjacent words
  //(extends the original [.,] set; addresses the punctuation to-do)
  $s = preg_replace("/[.,;:!?]/", "", $s);

  //terms that should never be linked
  //NOTE(review): str_replace removes substrings, so "about" also disappears
  //from e.g. "roundabout"; kept as-is to preserve the original behavior
  $excludelist = array("about", "test", "spam blacklist test");
  $s = str_replace($excludelist, "", $s);

  //separate the text into words
  $words = explode(' ', $s);

  //trim surrounding whitespace and drop entries that end up empty
  //(the original tested strlen() on the UNtrimmed word, so entries made of
  //only whitespace survived the filter)
  foreach ($words as $k => $w) {
    $w = trim($w);
    if (strlen($w) == 0) {
      unset($words[$k]);
    } else {
      $words[$k] = $w;
    }
  }

  //reindex the keys so word positions are contiguous
  $words = array_values($words);

  $count = count($words);
  $search = array();

  foreach ($words as $k => $v) {
    //add an individual word if it is long enough
    if (strlen($v) >= $wikifiMinWordLength) {
      //when $wikifiOnlyCheckProper is set, only capitalized words qualify
      //($v{0} curly-brace offsets are a fatal error as of PHP 8; use [0])
      if (!$wikifiOnlyCheckProper || ctype_upper($v[0])) {
        $search[] = $v;
      }
    }

    //add phrases of 2..$wikifiPhraseWordLimit words starting at word $k
    for ($j = 1; $j < $wikifiPhraseWordLimit; $j++) {
      //the phrase needs words $k+1 .. $k+$j to exist; the original used a
      //1-based counter with '<', which skipped phrases ending on the very
      //last word (the "last word never links" to-do)
      if (($k + $j) < $count) {
        $phrase = $v;
        for ($l = 1; $l <= $j; $l++) {
          $phrase .= ' ' . $words[$k + $l];
        }
        $search[] = $phrase;
      }
    }
  }

  //an empty IN () clause is invalid SQL, so bail out early
  if (count($search) == 0) {
    return true;
  }

  //convert the search terms to database key form (Title Case, underscores)
  foreach ($search as $k => $v) {
    $search[$k] = str_replace(' ', '_', ucwords($v));
  }

  //duplicate terms would only bloat the query
  $search = array_unique($search);

  //assemble what could be a massive sql query; titles are escaped with the
  //database's own quoting (addslashes is not a safe SQL escape for every
  //charset/backend combination)
  $quoted = array();
  foreach ($search as $v) {
    $quoted[] = $db->addQuotes($v);
  }

  $sql = "SELECT page_namespace, page_title FROM wikipage";
  $sql .= " WHERE page_namespace IN (" . implode(',', $wikifiSearchNamespaces) . ")";
  $sql .= " AND page_title IN (" . implode(', ', $quoted) . ")";

  $result = $db->doQuery($sql);

  //if we found a match
  if ($db->numRows($result)) {
    //loop through all of the matches
    while ($row = $db->fetchRow($result)) {
      $title = $row['page_title'];
      //NOTE(review): the original switched on page_namespace but every case
      //was an empty fall-through. Only NS_MAIN is searched by default, so no
      //prefix is needed; non-main namespaces would need a "Prefix:" added to
      //the link target here before they can be enabled.

      //find occurrences of the title (spaces instead of underscores) in the
      //original text; preg_quote stops titles containing regex
      //metacharacters from corrupting the pattern
      $find = str_replace('_', ' ', $title);
      $matches = array();
      preg_match_all('/' . preg_quote($find, '/') . '/i', $text, $matches);

      //each distinct capitalization of the term is linked separately so the
      //pipe text matches what the author typed
      $matches = array_unique($matches[0]);

      foreach ($matches as $m) {
        //capture the character before the term and the whitespace after it
        //instead of consuming them: the original pattern swallowed the
        //preceding character and padded the link with spaces to compensate
        $pattern = '/([^\[])' . preg_quote($m, '/') . '(\s)/';
        //NOTE(review): '$' or '\' inside a title would be interpreted by
        //preg_replace's replacement syntax; such titles are rare, but this
        //could be hardened further with preg_replace_callback
        $replacement = '${1}[[' . $title . '|' . $m . ']]${2}';
        $text = preg_replace($pattern, $replacement, $text);
      }
    }
  }

  return true;
}


?>