User:WildBot/dab template placer.py
Appearance
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Consumer threads that take page titles and check them for ambiguous links.
If ambiguous links are found, a template listing those links is added to the
talk page; otherwise that template is removed.
"""
import codecs
import datetime
import re
import threading
import time
import traceback

import catlib
import wikipedia

import watchlist_monitor
# Python 2 idiom: classes in this module that are declared without an explicit
# base become new-style classes.
__metaclass__ = type
# Limit cycles while in trial
# MsgLeaver.noteAmbiguousLinks increments put_count before every save attempt
# and only saves while put_count <= put_limit, so with put_limit = 0 no talk
# page is ever written; the attempt is only logged.
put_limit = 0
put_count = 0
# disambiguation page name format for "primary topic" disambiguations
# (disambiguation pages following "model 2")
# Keyed by language code; %s is the base title.
# NOTE(review): this table is not referenced anywhere else in the visible part
# of this file (is_ambiguous hard-codes the English "(disambiguation)" suffix).
primary_topic_format = {
'ar': u'%s_(توضيح)',
'cs': u'%s_(rozcestník)',
'de': u'%s_(Begriffsklärung)',
'en': u'%s_(disambiguation)',
'fi': u'%s_(täsmennyssivu)',
'hu': u'%s_(egyértelműsítő lap)',
'ia': u'%s_(disambiguation)',
'it': u'%s_(disambigua)',
'lt': u'%s_(reikšmės)',
'kk': u'%s_(айрық)',
'ko': u'%s_(동음이의)',
'nl': u'%s_(doorverwijspagina)',
'no': u'%s_(peker)',
'pl': u'%s_(ujednoznacznienie)',
'pt': u'%s_(desambiguação)',
'he': u'%s_(פירושונים)',
'ru': u'%s_(значения)',
'sr': u'%s_(вишезначна одредница)',
'sv': u'%s_(olika betydelser)',
'uk': u'%s_(значення)',
}
# Ambiguous Links Found template
# The 1= part is to sidestep errors with article titles containing "="
# %s receives the formatted list of ambiguous link titles.
ambiguous_template = {
'en' : u'{{User:WildBot/msg|1=%s}}',
}
# Ambiguous Links Found template locating regex
# NOTE(review): ".*" is greedy and does not cross newlines, so the match runs
# to the LAST "}}" on the same line as the template start.
ambiguous_template_regex = {
'en' : u'{{User:WildBot/msg.*}}',
}
# Edit summary msg
# %s receives the ambiguous titles rendered as wiki links.
summary_msg = {
'en' : u'Found ambiguous links to %s',
}
# Edit summary msg for a clean page
summary_all_gone_msg = {
'en' : u'No ambiguous links left',
}
class AllDisambiguationPages:
    """
    Index of disambiguation page titles and of redirects from incomplete
    disambiguations.

    Both title sets start empty and are filled by load(), which reads a
    plain-text cache file (one title per line, UTF-8) and falls back to
    listing the category from the site, rewriting the cache file as it goes.
    """

    def __init__(self, site=None):
        """
        site -- the wiki to query; defaults to wikipedia.getSite().
        """
        if site is None:
            site = wikipedia.getSite()
        self.site = site
        self.dab_file = 'disambiguations/all-disambiguation-pages.txt'
        self.articles = set()
        self.redir_file = 'disambiguations/redirects-from-incompletes-disambiguations.txt'
        self.redirects = set()

    def _read_cache(self, cache_set, cache_filename):
        """Add every non-empty line of the cache file to cache_set."""
        f = codecs.open(cache_filename, 'r', 'utf-8')
        try:
            for line in f:
                # Strip only the line terminator: slicing off the last
                # character unconditionally would truncate a final title that
                # is not followed by a newline.
                title = line.rstrip(u'\r\n')
                if title:
                    cache_set.add(title)
        finally:
            f.close()

    def _read_site(self, cache_set, cache_filename, category):
        """Fill cache_set from the category on the site and rewrite the cache file."""
        f = codecs.open(cache_filename, 'w', 'utf-8')
        try:
            wikipedia.output(u'Loading from site: this may take quite some time (as much as 30 minutes)')
            cat = catlib.Category(self.site, category)
            for article in cat.articles():
                title = article.title()
                cache_set.add(title)
                f.write(title + u'\n')
        finally:
            f.close()

    def _load_category(self, cache_set, cache_filename, category):
        """
        Populate cache_set with the titles in category, unless it is already
        populated.

        cache_set      -- set to fill in place
        cache_filename -- path of the text cache for this category
        category       -- full category title, including the namespace prefix

        An unreadable or undecodable cache file triggers a reload from the
        site. Failure to open the cache file for writing propagates to the
        caller.
        """
        if cache_set:
            return
        wikipedia.output('Loading ' + category)
        wikipedia.output('Reading cache file ' + cache_filename)
        try:
            self._read_cache(cache_set, cache_filename)
        except (IOError, OSError, UnicodeError):
            # failed to read in cached file, read from site; drop anything a
            # partial read may have left behind first
            cache_set.clear()
            self._read_site(cache_set, cache_filename, category)
        thesize = str(len(cache_set))
        wikipedia.output(category + u' loaded: ' + thesize + u' articles')

    def load(self):
        """Load both title sets (a no-op for any set that is already filled)."""
        #Load in all dab pages (takes half a hour if you're on a slow link and a non-bot account)
        self._load_category(self.articles, self.dab_file, u"Category:All disambiguation pages")
        #Load in all dab redirects
        self._load_category(self.redirects, self.redir_file, u"Category:Redirects from incomplete disambiguations")

    def is_ambiguous(self, title):
        """
        Is a link to this title an ambiguous link?

        Titles containing "(disambiguation)" are never reported: such links
        are treated as intentional.
        """
        return "(disambiguation)" not in title and (title in self.articles or title in self.redirects)

    def is_disambiguation_like(self, title):
        """
        Is this page a disambiguation page or a redirect to one?
        """
        return title in self.articles or title in self.redirects

    def ambiguous_titles_on_page(self, page):
        """
        Return the set of ambiguous titles that page links to.

        The result is empty when page is itself disambiguation-like.
        """
        result = set()
        if self.is_disambiguation_like(page.title()):
            # Disambiguation pages are ignored
            return result
        for target in page.linkedPages():
            target_title = target.title()
            if self.is_ambiguous(target_title):
                wikipedia.output(u'Ambiguous: >>>>%s<<<<' % target_title)
                result.add(target_title)
        return result
# global to share between all objects
# Created at import time without an explicit site, so the constructor falls
# back to wikipedia.getSite(). Both title sets stay empty until load() is
# called, which MsgLeaver.__init__ does.
dabs = AllDisambiguationPages()
class MsgLeaver( threading.Thread ):
    """
    Base thread that maintains the "ambiguous links found" template on the
    talk page of an article.

    Subclasses provide run(); this class supplies the talk-page editing
    (noteAmbiguousLinks), the "page is being worked on" check (checkContents)
    and best-effort log-file appends (logline).
    """

    # Regexes, keyed by language code; an article whose text matches one of
    # them is skipped by noteAmbiguousLinks.
    ignore_contents = {
        'de':(u'{{[Ii]nuse}}',
              u'{{[Ll]öschen}}',
              ),
        'en':(u'{{[Ii]nuse}}',
              u'{{[Nn]ewpage}}',
              u'{{[Uu]nderconstruction}}',
              ),
        'fi':(u'{{[Tt]yöstetään}}',
              ),
        'kk':(u'{{[Ii]nuse}}',
              u'{{[Pp]rocessing}}',
              ),
        'nl':(u'{{wiu2}}',
              u'{{nuweg}}',
              ),
        'ru':(u'{{[Ii]nuse}}',
              u'{{[Pp]rocessing}}',
              ),
        }

    # Initialization stuff
    def __init__(self, shutdown):
        """
        shutdown -- event-like object shared between the worker threads; it
                    is stored here for subclasses, which poll and set it.
        """
        self.shutdown = shutdown
        dabs.load()
        self.site = wikipedia.getSite()
        # compile regular expressions; the table is keyed by language code,
        # not by Site object, so look it up through site.lang
        self.ignore_contents_regexes = [
            re.compile(ig)
            for ig in self.ignore_contents.get(self.site.lang, ())]
        self.amb_template = wikipedia.translate(self.site, ambiguous_template)
        self.amb_regex = re.compile(wikipedia.translate(self.site, ambiguous_template_regex))
        self.ambiguous_tagged_log = wikipedia.config.datafilepath('disambiguations',
            'ambiguous-tagged-%s-%s.log' % (self.site.family.name, self.site.lang))
        self.ambiguous_skipped_log = wikipedia.config.datafilepath('disambiguations',
            'ambiguous-skipped-%s-%s.log' % (self.site.family.name, self.site.lang))
        threading.Thread.__init__(self)

    def logline(self, log_filename, logtext):
        """
        Append logtext to log_filename (UTF-8).

        Logging is best-effort: I/O and encoding failures are reported on the
        console and otherwise ignored.
        """
        try:
            f = codecs.open(log_filename, 'a+', 'utf-8')
            try:
                f.write(logtext)
            finally:
                f.close()
        except (IOError, OSError, UnicodeError):
            wikipedia.output(u'Could not write to log file %s' % log_filename)

    def checkContents(self, text):
        '''
        For a given text, returns None if none of the regular
        expressions compiled from ignore_contents for this site
        matches a substring of the text.
        Otherwise returns the substring which is matched by one of
        the regular expressions.
        '''
        for ig in self.ignore_contents_regexes:
            match = ig.search(text)
            if match:
                return match.group()
        return None

    def _format_titles(self, dab_titles):
        """
        Render a non-empty collection of titles for the template and summary.

        Returns (template_titles, dab_links): the template argument is a
        comma-separated list, or a bulleted list if any title itself contains
        a comma; dab_links is the titles as comma-separated wiki links.
        """
        # Materialise once so that both strings list the titles in one order.
        titles = list(dab_titles)
        if any(',' in title for title in titles):
            template_titles = '<br />' + ''.join('\n*' + title for title in titles)
        else:
            template_titles = ', '.join(titles)
        dab_links = '[[' + ']],[['.join(titles) + ']]'
        return template_titles, dab_links

    def _splice_template(self, text, replace_template):
        """
        Return text with the existing template replaced by replace_template.

        If there is no existing template, replace_template is put on its own
        line at the top. An empty replace_template removes the template; in
        that case None is returned when there is nothing to remove.
        """
        found = self.amb_regex.search(text)
        if found:
            # We know where to update the template
            template_start, template_end = found.span()
            # we're removing the template and it's on the end of a line, so
            # remove that newline character too; slicing (rather than
            # indexing) stays safe when the template ends the page
            if not replace_template and text[template_end:template_end + 1] == '\n':
                template_end += 1
        elif replace_template:
            # We didn't find the template so add it to the top of the page
            template_start = template_end = 0
            replace_template += '\n'
        else:
            # We were going to remove it, but it's not there
            return None
        return text[:template_start] + replace_template + text[template_end:]

    def noteAmbiguousLinks(self, page, dab_titles):
        """
        Bring the talk page of page in line with dab_titles.

        page       -- the article that was checked
        dab_titles -- collection of ambiguous titles the article links to;
                      empty means the template should be removed

        Nothing happens if the article is a redirect, no longer exists, or
        contains one of the ignore_contents markers. Saving is subject to the
        module-level put_limit trial counter and to 50% sampling on the
        article title length; every save attempt is logged.
        """
        # NOTE: put_count is a plain module global that is updated here
        # without any locking.
        global put_count
        #Turn set into strings for template and edit summary
        if dab_titles:
            template_titles, dab_links = self._format_titles(dab_titles)
            summary = wikipedia.translate(self.site, summary_msg) % dab_links
            replace_template = self.amb_template % template_titles
        else:
            # No ambiguous links, removing template
            summary = wikipedia.translate(self.site, summary_all_gone_msg)
            replace_template = ''

        try:
            self.content = page.get()
        except wikipedia.IsRedirectPage:
            wikipedia.output(u'Already redirected, skipping.')
            return
        except wikipedia.NoPage:
            wikipedia.output(u'Already deleted')
            return
        ignoreReason = self.checkContents(self.content)
        if ignoreReason:
            #add retry
            wikipedia.output('\n\nSkipping %s because it contains %s.\n\n' % (page.title(), ignoreReason))
            return

        #load talk page, munge it
        talkpage = page.toggleTalkPage()
        # keep the original text so we can show the changes later
        oldtalk = u''
        try:
            oldtalk = talkpage.get(get_redirect=True)
        except wikipedia.NoPage:
            text = replace_template
        else:
            text = self._splice_template(oldtalk, replace_template)
            if text is None:
                return

        if text == oldtalk:
            wikipedia.output(u'No changes have been made.')
            return
        if oldtalk:
            wikipedia.output(u'The following changes have been made to %s\n' % talkpage.permalink())
        else:
            wikipedia.output(u'The following changes have been made to %s\n' % talkpage.aslink())
        wikipedia.showDiff(oldtalk, text)

        # save the page
        logtext = page.title() + u'|' + summary + '\n'
        put_count += 1
        try:
            if put_count > put_limit:
                wikipedia.output(u'Run limit reached')
                self.logline(self.ambiguous_tagged_log, logtext)
            elif len(page.title()) % 2 == 0:
                #for statistic gathering purposes, initially only work on about half of the candiate articles
                talkpage.put_async(text,comment=summary,watchArticle=True,minorEdit=False)
                wikipedia.output(u'Page saved')
                self.logline(self.ambiguous_tagged_log, logtext)
            else:
                wikipedia.output(u'Page skipped for sampling purposes')
                # the untouched half of the sample goes to the skipped log
                self.logline(self.ambiguous_skipped_log, logtext)
        except wikipedia.LockedPage:
            #add retry?
            wikipedia.output(u'Page not saved: page is locked')
        except wikipedia.PageNotSaved as error:
            # format the exception itself: "%s" % error.args fails unless
            # args holds exactly one element
            wikipedia.output(u'Page not saved: %s' % (error,))
class TalkCleaner( MsgLeaver ):
    """
    Worker thread that re-checks queued pages and refreshes or removes the
    ambiguous-links template on their talk pages, recording each page's edit
    time with the watchlist monitor afterwards.
    """

    def __init__(self, shutdown, queue):
        """
        shutdown -- shared shutdown event, passed on to MsgLeaver
        queue    -- source of pages; must offer remove_page() and add_page()
        """
        self.queue = queue
        MsgLeaver.__init__(self, shutdown)
        self.lasttime = watchlist_monitor.LastWatchlistCheck(self.site)

    def run(self):
        """
        Process queued pages until the shutdown event is set.

        Any exception other than wikipedia.NoPage sets the shutdown event,
        pushes None onto the queue, stops the framework and is re-raised.
        """
        try:
            while not self.shutdown.isSet():
                page = self.queue.remove_page()
                try:
                    remaining = dabs.ambiguous_titles_on_page(page)
                    if remaining:
                        status = u'Ambiguous links remain on '
                    else:
                        status = u'No ambiguous links left on '
                    wikipedia.output(status + page.title())
                    # This test is only necessary because of a bug in editTime()
                    # NOTE(review): the file's indentation was lost; all three
                    # statements are assumed to sit under this guard, since
                    # the last one needs pagetime - confirm.
                    if not dabs.is_disambiguation_like(page.title()):
                        pagetime = page.editTime()
                        self.noteAmbiguousLinks(page, remaining)
                        self.lasttime.put(pagetime)
                    self.shutdown.wait(1)
                except wikipedia.NoPage:
                    wikipedia.output(u'seems already gone')
        except:
            # deliberately catches everything: signal shutdown, clean up,
            # then let the original exception continue
            self.shutdown.set()
            self.queue.add_page(None)
            wikipedia.stopme()
            raise
class NewPageChecker( MsgLeaver ):
    """
    Worker thread that checks queued new pages for ambiguous links.

    Pages with ambiguous links get the talk-page template through
    noteAmbiguousLinks; pages without any are appended to the
    "unambiguous-skipped" log.
    """

    # Initialization stuff
    def __init__(self, shutdown, queue):
        """
        shutdown -- shared shutdown event, passed on to MsgLeaver
        queue    -- source of pages; must offer remove_page() and add_page()
        """
        self.queue = queue
        # The base initialiser sets self.site, which the log file name needs,
        # so it has to run before the path is built.
        MsgLeaver.__init__(self, shutdown)
        self.unambiguous_log = wikipedia.config.datafilepath('disambiguations',
            'unambiguous-skipped-%s-%s.log' % (self.site.family.name, self.site.lang))

    def run(self):
        """
        Process queued pages until the shutdown event is set.

        Any exception other than wikipedia.NoPage sets the shutdown event,
        pushes None onto the queue, stops the framework and is re-raised.
        """
        try:
            while not self.shutdown.isSet():
                page = self.queue.remove_page()
                try:
                    titles = dabs.ambiguous_titles_on_page(page)
                    if titles:
                        wikipedia.output(u'New page ' + page.title() + u' has ambiguous links...')
                        self.noteAmbiguousLinks(page, titles)
                        wikipedia.output(u'----- Current time: %s' % datetime.datetime.now())
                    else:
                        self.logline(self.unambiguous_log, page.title() + u'|\n')
                    self.shutdown.wait(1)
                except wikipedia.NoPage:
                    wikipedia.output(u'seems already gone')
        except:
            # deliberately catches everything: signal shutdown, clean up,
            # then let the original exception continue
            self.shutdown.set()
            self.queue.add_page(None)
            wikipedia.stopme()
            raise