Wikipedia:WikiProject Interlanguage Links/Scripts

From Wikipedia, the free encyclopedia

Into a mysql database load the page and langlinks database dump files from two or more wikipedia Wikipedia:Database_download.

Extract interlanguage links into a single table for anaylysis


CREATE TABLE inter (
     from_lang    varchar(10),
     from_title   varchar(255),
     to_lang      varchar(10),
     to_title     varchar(255)
);
USE en;

INSERT INTO inter.inter
SELECT 'en', page.page_title, langlinks.ll_lang, langlinks.ll_title
FROM page, langlinks
WHERE page.page_id = langlinks.ll_from
AND   page.page_namespace = 0;

USE de;

INSERT INTO inter.inter
SELECT 'de', page.page_title, langlinks.ll_lang, langlinks.ll_title
FROM page, langlinks
WHERE page.page_id = langlinks.ll_from
AND   page.page_namespace = 0;

... etc

Tidy up this table and index it:


update inter set from_title = replace( from_title, '_', ' ' );
update inter set to_title = replace( to_title, '_', ' ' );

// Clear nonsensical entries
 -- entries to nonsensical languages
 -- entries to blank titles
 -- entries to unlikely titles
 -- poss entries to nonexistant articles ?

ALTER TABLE inter ADD INDEX ( from_lang, from_title );
ALTER TABLE inter ADD INDEX ( to_lang, to_title );

Extract the information required, ie:


// suggested reciporcal links
// a:x -> b:x and b:x exists and no link from b:? to a:x exists => b:x -> a:x

INSERT INTO suggestions
SELECT a.to_lang, a.to_title, a.from_lang, a.from_title
FROM inter a
INNER JOIN en.page p
    ON p.page_title = a.to_title
    AND p.page_namespace = 0
    AND p.page_is_redirect = 0
LEFT JOIN inter b
    ON   b.from_lang = a.to_lang
    AND  b.from_title = a.to_title     
    AND  b.to_lang = a.from_lang
WHERE a.from_lang IN ('de','es','fr','it','nl','ja','pl','pt','sv' )
AND   a.to_lang = 'en'
AND   b.from_lang IS NULL;

// Interlanguage links to pages that do not exist
// en -> fr only
INSERT INTO suggestions
SELECT a.from_lang, a.from_title, a.to_lang, a.to_title
FROM inter a
INNER JOIN en.page f
    ON f.page_title = a.from_title
    AND f.page_namespace = 0
    AND f.page_is_redirect = 0
LEFT JOIN fr.page t
    ON   t.page_title = a.to_title
	AND  t.page_namespace = 0
WHERE a.from_lang = 'en'
AND   a.to_lang = 'fr'
AND   t.page_title IS NULL


// Interlanguage links to redirects
// en -> fr only

SELECT a.from_lang, a.from_title, a.to_lang, a.to_title
FROM inter a
INNER JOIN en.page f
    ON f.page_title = a.from_title
    AND f.page_namespace = 0
    AND f.page_is_redirect = 0
INNER JOIN fr.page t
    ON   a.to_title = t.page_title
	AND  t.page_namespace = 0
	AND  t.page_is_redirect = 1
WHERE a.from_lang = 'en'
AND   a.to_lang = 'fr';

Finally, extract these suggestions in a human-readable format


DROP PROCEDURE IF EXISTS report_suggestions;
DELIMITER //

CREATE PROCEDURE report_suggestions( group_size INT, flang VARCHAR(10), tlang VARCHAR(10) )
BEGIN
   DECLARE sug_pos, sug_base, done INT;
   DECLARE ftitle, ttitle VARCHAR(255);
   DECLARE sug CURSOR FOR SELECT DISTINCT from_title, to_title FROM suggestions WHERE from_lang = flang AND to_lang = tlang;
   DECLARE CONTINUE HANDLER FOR SQLSTATE '23000' SET done = 1;

   SET sug_pos = 0;
   SET sug_base = 0;
   SET done = 0;
   OPEN sug;

   REPEAT

     FETCH sug INTO ftitle, ttitle;

     IF NOT done THEN
       IF sug_pos = 0 THEN
          SELECT concat( '=== ', sug_base, ' - ', sug_base + group_size - 1, ' ===' );
          SET sug_base = sug_base + group_size;
          SET sug_pos = group_size - 1;
       ELSE
          SET sug_pos = sug_pos - 1;
       END IF;

       SELECT concat( '*[[', ftitle, ']] → [[:', tlang, ':', ttitle, ']]' );
     END IF;

  UNTIL done END REPEAT;
  
  CLOSE sug;
END;
//

DELIMITER ;

call report_suggestions( 10, 'en', 'sv' );