Jump to content

User:BogBot/Source code/Task 02

From Wikipedia, the free encyclopedia
#!/usr/bin/python

# Bot Script to replace the opening sentence:
#    '''G protein-coupled receptor 3''', also known as '''GPR3''', is a human [[gene]].
# with:
#    '''G protein-coupled receptor 3''' is a [[protein]] that in humans is encoded by the ''GPR3'' [[gene]].

import re
import wikipedia

from Bio import Entrez
from Bio import Medline

# Contact e-mail set on the Bio.Entrez module; it is assigned once here, before
# PubMed_Citation() issues any Entrez.efetch request.
Entrez.email = "boghog@mac.com"

# Lookup table from a zero-padded month number ('01' .. '12') to its
# three-letter English abbreviation ("Jan" .. "Dec").
months = dict(
    ("%02d" % number, abbreviation)
    for number, abbreviation in zip(
        range(1, 13),
        ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
    )
)

# s = "hello normal string"
# u = unicode( s, "utf-8" )
# backToBytes = u.encode( "utf-8" )

# Included for bot exclusion compliance (see http://en.wikipedia.org/wiki/Template:Bots)

user = "BogBot"

# Matches the exclusion templates this bot honours:
#   {{nobots}}, {{bots|allow=none}}, {{bots|optout=all}}, {{bots|deny=all}}
#   and {{bots|deny=...}} when the deny list mentions this bot.
# The bot name is passed through re.escape so it is always matched literally,
# and the deny list is matched with [^{}] instead of "." so the match cannot
# run past the template's own closing braces and pick up an unrelated mention
# of the bot name later in the text.
regexp_ab = re.compile(
    r'\{\{(nobots|bots\|(allow=none|deny=[^{}]*?' + re.escape(user)
    + r'[^{}]*?|optout=all|deny=all))\}\}')


def Allowbots(text):
    """Return True if this bot may edit a page with the given wikitext.

    text -- the wikitext of the page to check.

    Returns False when the text contains one of the exclusion templates
    recognised by ``regexp_ab``, True otherwise.
    """
    return regexp_ab.search(text) is None

def PubMed_Citation(PubMed_ID):
    """Fetch a PubMed record and render it as a wikitext <ref> citation.

    PubMed_ID -- the PubMed identifier passed as ``id`` to Entrez.efetch.

    Returns a string with one ``<ref name="pmid...">{{cite journal ...}}</ref>``
    element per record parsed from the response, concatenated in order.
    Returns "" when the response yields no usable record.
    """
    handle = Entrez.efetch(db="pubmed", id=PubMed_ID, rettype="medline", retmode="text")
    try:
        # Medline.parse reads lazily from the handle, so the records have to
        # be consumed before the handle is closed.
        refs = [_format_citation(record) for record in Medline.parse(handle)]
    finally:
        handle.close()
    return "".join(refs)


def _format_citation(record):
    """Render one parsed Medline record as a {{cite journal}} <ref> element.

    Fields missing from the record are emitted as empty template parameters.
    A record without a PMID yields "" because the reference name is built
    from the PMID.
    """
    pmid = record.get("PMID", "")
    if not pmid:
        return ""

    author_list = ", ".join(record.get("AU", []))

    # Drop a single trailing full stop from the title.
    title = record.get("TI", "")
    if title.endswith("."):
        title = title[:-1]

    journal = record.get("TA", "")
    volume = record.get("VI", "")
    issue = record.get("IP", "")
    pages = record.get("PG", "")

    # DA is read as YYYYMMDD: the first four characters are the year and the
    # next two are the month number. An unrecognised month number gives an
    # empty month instead of raising KeyError.
    # NOTE(review): confirm DA is the date that should appear in the citation.
    date_stamp = record.get("DA", "")
    year = date_stamp[:4]
    month = months.get(date_stamp[4:6], "")

    # The first three characters of the PMC value are stripped
    # (presumably the "PMC" prefix -- verify against real records).
    pmc = record.get("PMC", "")[3:]

    # AID entries look like "<identifier> [<kind>]"; keep the one tagged
    # [doi]. If several are tagged [doi], the last one wins.
    doi = ""
    for item in record.get("AID", []):
        elements = item.split(" ")
        if len(elements) == 2 and elements[1] == "[doi]":
            doi = elements[0]

    return "".join([
        "<ref name=\"pmid", pmid, "\">{{cite journal",
        " | author = ", author_list,
        " | title = ", title,
        " | journal = ", journal,
        " | volume = ", volume,
        " | issue = ", issue,
        " | pages = ", pages,
        " | year = ", year,
        " | month = ", month,
        " | pmid = ", pmid,
        " | pmc = ", pmc,
        " | doi = ", doi,
        " }}</ref>",
    ])

# compiled regular expression

regexp_opening_sentence = re.compile(r"\'\'\'.+\'\'\', also known as \'\'\'.+\'\'\', is a human \[\[gene\]\]\.")

regexp_enzyme = re.compile(r"ase\b")

# main loop

articles = []
f = open('/Users/boghog/progs/python/pywikipedia/test.tab', 'r')
for line in f:
  fields = line.split("\t")
  article = fields[0]
  UniProt_Name = fields[1]
  HUGO_Gene_Symbol = fields[3]
  PubMed_IDs = []
  if fields[7]:
    PubMed_IDs = fields[7].split(",")
#  if (article == "Wiki_name"):
#    break # skip header line

  log_string = "* [[" + article + "]]" 
  print log_string,

  site = wikipedia.getSite()
  page = wikipedia.Page(site, article)
  text = unicode(page.get(get_redirect = True))

  if not Allowbots(text):
    print ", bots not allowed, skipping article"
    break

  if (UniProt_Name and regexp_opening_sentence.search(text)):
  
    if regexp_enzyme.search(UniProt_Name):
      type = " is an [[enzyme]] "
    else:
      type = " is a [[protein]] "

    new_opening_sentence = "'''" + UniProt_Name + "'''" + type + "that in humans is encoded by the ''" + HUGO_Gene_Symbol + "'' [[gene]]."
  
    for PubMed_ID in PubMed_IDs:
      new_opening_sentence = new_opening_sentence + PubMed_Citation(PubMed_ID)
      text = re.sub(r'\*.*\{\{.*pmid.*=.*' + PubMed_ID + r'.*\}\}\n', "", text)
    
    text = regexp_opening_sentence.sub(new_opening_sentence, text)
    page.put(text, comment='edited opening sentence to make clear that article is about both protein and the gene that encodes it', watchArticle = None, minorEdit = False)
#     print text.encode('utf-8')
    print ", page updated"
  else:
    print ", page skipped"
    
wikipedia.stopme()