User:BogBot/Source code/Task 02
Appearance
#!/usr/bin/python
# Bot Script to replace the opening sentence:
# '''G protein-coupled receptor 3''', also known as '''GPR3''', is a human [[gene]].
# with:
# '''G protein-coupled receptor 3''' is a [[protein]] that in humans is encoded by the ''GPR3'' [[gene]].
import re
import sys

import wikipedia
from Bio import Entrez
from Bio import Medline
# Contact address registered with Bio.Entrez; presumably passed along to NCBI
# with every E-utilities request made below -- see the Bio.Entrez documentation.
Entrez.email = "boghog@mac.com"
# Lookup table: zero-padded two-digit month number -> three-letter English
# month abbreviation (used to fill the "month" citation parameter).
months = dict(zip(
    ("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"),
    ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
))
# s = "hello normal string"
# u = unicode( s, "utf-8" )
# backToBytes = u.encode( "utf-8" )
# Included for bot exclusion compliance (see http://en.wikipedia.org/wiki/Template:Bots)
user = "BogBot"
# Matches {{nobots}} and the {{bots|...}} variants that exclude this bot.
# The deny list is matched with [^{}]*? rather than .*? so that it cannot run
# past the template's own closing braces: otherwise any later mention of the
# bot name on the same line, followed by some other "}}", counts as a denial.
# The bot name is escaped so it is always matched literally.
regexp_ab = re.compile(
    r'\{\{(nobots|bots\|(allow=none|deny=[^{}]*?'
    + re.escape(user)
    + r'[^{}]*?|optout=all|deny=all))\}\}'
)


def Allowbots(text):
    """Tell whether this bot may edit a page with the given wikitext.

    text -- the page's wikitext.

    Returns False when the text contains {{nobots}}, or a {{bots}} template
    with allow=none, optout=all, deny=all or a deny list naming ``user``;
    returns True otherwise.
    """
    return regexp_ab.search(text) is None
def _citation_from_record(record):
    """Format one parsed MEDLINE record as a <ref>-wrapped {{cite journal}}.

    record -- dict-like object keyed by MEDLINE field tags.  The tags read are
              AU, TI, TA, VI, IP, PG, DA, PMID, PMC and AID; any tag that is
              absent yields an empty template parameter.

    Returns the citation wikitext as a string.
    """
    authors = ", ".join(record.get("AU", []))

    title = record.get("TI", "")
    if title.endswith("."):
        # Drop one trailing period from the title.
        title = title[:-1]

    # DA is sliced as YYYYMM...: first four characters are the year, the next
    # two index the module-level ``months`` table.  An unrecognised month code
    # gives an empty month instead of raising KeyError.
    # NOTE(review): presumably DA is the record's creation date -- confirm it
    # is the intended source for the citation's year/month.
    created = record.get("DA", "")
    year = created[:4]
    month = months.get(created[4:6], "")

    pmid = record.get("PMID", "")
    # The first three characters of the PMC field are discarded
    # (presumably a "PMC" prefix -- TODO confirm).
    pmc = record.get("PMC", "")[3:]

    # AID entries look like "<value> [<type>]"; keep the value of the last
    # entry whose type marker is "[doi]".
    doi = ""
    for article_id in record.get("AID", []):
        parts = article_id.split(" ")
        if len(parts) == 2 and parts[1] == "[doi]":
            doi = parts[0]

    parameters = (
        ("author", authors),
        ("title", title),
        ("journal", record.get("TA", "")),
        ("volume", record.get("VI", "")),
        ("issue", record.get("IP", "")),
        ("pages", record.get("PG", "")),
        ("year", year),
        ("month", month),
        ("pmid", pmid),
        ("pmc", pmc),
        ("doi", doi),
    )
    body = "".join(" | %s = %s" % (key, value) for key, value in parameters)
    # The ref name reuses the guarded ``pmid`` value, so a record lacking a
    # PMID tag no longer raises KeyError here.
    return '<ref name="pmid' + pmid + '">{{cite journal' + body + " }}</ref>"


def PubMed_Citation(PubMed_ID):
    """Build wiki citation markup for a PubMed ID.

    Fetches the record(s) for PubMed_ID from the "pubmed" database in MEDLINE
    text format via Entrez.efetch, parses them with Medline.parse and formats
    each one as a <ref>{{cite journal ...}}</ref> string.

    PubMed_ID -- the PubMed identifier; surrounding whitespace is ignored.

    Returns the concatenated citation markup for all records returned, or an
    empty string when PubMed_ID is blank (no request is made in that case).
    """
    if hasattr(PubMed_ID, "strip"):
        # IDs read from a tab file may carry a trailing newline or spaces.
        PubMed_ID = PubMed_ID.strip()
    if not PubMed_ID:
        return ""

    handle = Entrez.efetch(db="pubmed", id=PubMed_ID, rettype="medline", retmode="text")
    try:
        # Medline.parse is consumed inside the try block so the handle is
        # still open while records are read, and closed afterwards even if
        # parsing or formatting fails.
        return "".join(_citation_from_record(record) for record in Medline.parse(handle))
    finally:
        handle.close()
# Pre-compiled patterns used by the main loop.

# The old-style opening sentence that is to be replaced, e.g.
#   '''<name>''', also known as '''<symbol>''', is a human [[gene]].
regexp_opening_sentence = re.compile(
    r"\'\'\'.+\'\'\'"                 # first bold phrase
    r", also known as "
    r"\'\'\'.+\'\'\'"                 # second bold phrase
    r", is a human \[\[gene\]\]\."
)

# Matches "ase" immediately followed by a word boundary; the main loop uses a
# hit on the protein name to choose "enzyme" instead of "protein".
regexp_enzyme = re.compile(r"ase\b")
# main loop
articles = []  # never read in this script; kept so the module's globals are unchanged

_TAB_FILE = '/Users/boghog/progs/python/pywikipedia/test.tab'
_EDIT_SUMMARY = 'edited opening sentence to make clear that article is about both protein and the gene that encodes it'


def _process_row(site, line):
    """Update the article described by one tab-separated input row.

    site -- the wiki site object returned by wikipedia.getSite().
    line -- one raw line of the input file.  Column 0 is the article title,
            column 1 the protein name, column 3 the gene symbol and column 7
            (optional) a comma-separated list of PubMed IDs.

    Prints one log line per processed row.  Rows with fewer than four columns
    (e.g. blank lines) are ignored silently.
    """
    # Strip the line terminator first so it does not end up inside the last
    # column (and from there inside a PubMed ID or a regular expression).
    fields = line.rstrip("\r\n").split("\t")
    if len(fields) < 4:
        return
    article = fields[0]
    protein_name = fields[1]
    gene_symbol = fields[3]
    pubmed_field = fields[7] if len(fields) > 7 else ""
    pubmed_ids = [item.strip() for item in pubmed_field.split(",") if item.strip()]

    # NOTE: a header row (first column "Wiki_name") is NOT skipped; the check
    # for it was already commented out in the original script.

    # Written before any network access so an aborted run shows which article
    # was being processed.  The trailing space mirrors the soft space emitted
    # by the former Python 2 "print x," statement, keeping the log unchanged.
    sys.stdout.write("* [[" + article + "]] ")
    sys.stdout.flush()

    page = wikipedia.Page(site, article)
    text = unicode(page.get(get_redirect=True))  # Python 2 builtin; this script targets Python 2

    if not Allowbots(text):
        print(", bots not allowed, skipping article")
        # Skip only this article; the original "break" ended the whole run.
        return

    if not (protein_name and regexp_opening_sentence.search(text)):
        print(", page skipped")
        return

    if regexp_enzyme.search(protein_name):
        kind = " is an [[enzyme]] "
    else:
        kind = " is a [[protein]] "
    new_opening_sentence = ("'''" + protein_name + "'''" + kind
                            + "that in humans is encoded by the ''"
                            + gene_symbol + "'' [[gene]].")

    for pubmed_id in pubmed_ids:
        new_opening_sentence = new_opening_sentence + PubMed_Citation(pubmed_id)
        # Remove bulleted lines that cite the same PMID, since the citation is
        # now attached to the opening sentence.  The ID is escaped and must
        # directly follow "pmid =" without being the prefix of a longer
        # number, so a bullet citing e.g. pmid 91234 survives when the ID is 1234.
        text = re.sub(r'\*.*\{\{.*pmid\s*=\s*' + re.escape(pubmed_id) + r'(?!\d).*\}\}\n', "", text)

    # A callable replacement inserts the sentence literally; a plain string
    # would have its backslashes interpreted as escapes / group references.
    text = regexp_opening_sentence.sub(lambda match: new_opening_sentence, text)
    page.put(text, comment=_EDIT_SUMMARY, watchArticle=None, minorEdit=False)
    # print text.encode('utf-8')
    print(", page updated")


def main():
    """Process every row of the input tab file, one article per row."""
    site = wikipedia.getSite()  # fetched once and reused for every row
    with open(_TAB_FILE, 'r') as tab_file:
        for line in tab_file:
            _process_row(site, line)


# Runs on execution/import exactly as the original flat script did; the
# finally clause makes sure wikipedia.stopme() is called even if a row fails.
try:
    main()
finally:
    wikipedia.stopme()