User:WildBot/dab template placer.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Consumer threads that take page titles and check them for ambiguous links.
If ambiguous links are found, 
  a template is added to the talk page listing those links
otherwise
  that template is removed
"""

import time, traceback, codecs, re, threading, datetime
import wikipedia, catlib
import watchlist_monitor

__metaclass__ = type

# Limit cycles while in trial
put_limit = 0
put_count = 0

# disambiguation page name format for "primary topic" disambiguations
# (known on the German Wikipedia as "Begriffsklärungen nach Modell 2")
primary_topic_format = {
    'ar': u'%s_(توضيح)',
    'cs': u'%s_(rozcestník)',
    'de': u'%s_(Begriffsklärung)',
    'en': u'%s_(disambiguation)',
    'fi': u'%s_(täsmennyssivu)',
    'hu': u'%s_(egyértelműsítő lap)',
    'ia': u'%s_(disambiguation)',
    'it': u'%s_(disambigua)',
    'lt': u'%s_(reikšmės)',
    'kk': u'%s_(айрық)',
    'ko': u'%s_(동음이의)',
    'nl': u'%s_(doorverwijspagina)',
    'no': u'%s_(peker)',
    'pl': u'%s_(ujednoznacznienie)',
    'pt': u'%s_(desambiguação)',
    'he': u'%s_(פירושונים)',
    'ru': u'%s_(значения)',
    'sr': u'%s_(вишезначна одредница)',
    'sv': u'%s_(olika betydelser)',
    'uk': u'%s_(значення)',
    }

# Ambiguous Links Found template
# The 1= part is to sidestep errors with article titles containing "="
ambiguous_template = {
    'en' : u'{{User:WildBot/msg|1=%s}}',
    }
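
# For example (illustration only, derived from the format string above): for
# template_titles of u'2 + 2 = 5', the substitution yields
#   u'{{User:WildBot/msg|1=2 + 2 = 5}}'
# Without the explicit 1=, MediaWiki would treat everything before the '=' as a
# parameter name and the title would be mangled.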

# Ambiguous Links Found template locating regex
ambiguous_template_regex = {
    'en' : u'{{User:WildBot/msg.*}}',
    }

# Edit summary msg
summary_msg = {
    'en' : u'Found ambiguous links to %s',
    }

# Edit summary msg for a clean page
summary_all_gone_msg = {
    'en' : u'No ambiguous links left',
    }



class AllDisambiguationPages:
    def __init__(self, site=None):
        if site is None:
            site = wikipedia.getSite()
        self.site = site
        self.dab_file = 'disambiguations/all-disambiguation-pages.txt'
        self.articles = set()
        self.redir_file = 'disambiguations/redirects-from-incompletes-disambiguations.txt'
        self.redirects = set()

    def _load_category(self, cache_set, cache_filename, category):
        if not cache_set:
            # read the cache file line-by-line
            wikipedia.output('Loading ' + category)
            wikipedia.output('Reading cache file ' + cache_filename)
            try:
                f = codecs.open(cache_filename, 'r', 'utf-8')
                try:
                    for line in f:
                        cache_set.add(line.rstrip(u'\n'))
                finally:
                    f.close()
            except IOError:
                # no usable cache file; fetch from the site and rebuild the cache
                wikipedia.output(u'Loading from site: this may take quite some time (as much as 30 minutes)')
                cat = catlib.Category(self.site, category)
                f = codecs.open(cache_filename, 'w', 'utf-8')
                try:
                    for article in cat.articles():
                        cache_set.add(article.title())
                        f.write(article.title())
                        f.write(u'\n')
                finally:
                    f.close()
            wikipedia.output(u'%s loaded: %d articles' % (category, len(cache_set)))

    def load(self):
        # load all dab pages (takes half an hour on a slow link with a non-bot account)
        self._load_category(self.articles, self.dab_file, u"Category:All disambiguation pages")
        # load all dab redirects
        self._load_category(self.redirects, self.redir_file, u"Category:Redirects from incomplete disambiguations")

    def is_ambiguous(self, title):
        # TODO: test against primary_topic_format to see if ambiguous links from here are acceptable
        return "(disambiguation)" not in title and (title in self.articles or title in self.redirects)

    def is_disambiguation_like(self, title):
        """
        Is this page a disambiguation page or a redirect to one?
        """
        return title in self.articles or title in self.redirects

    def ambiguous_titles_on_page(self, page):
        result = set()
        if self.is_disambiguation_like(page.title()):
            # disambiguation pages themselves are ignored
            return result

        for target in page.linkedPages():
            if self.is_ambiguous(target.title()):
                wikipedia.output(u'Ambiguous: >>>>%s<<<<' % target.title())
                result.add(target.title())
        return result

# global to share between all objects
dabs = AllDisambiguationPages()


class MsgLeaver( threading.Thread ):
    # extended delay on altering the page if this is in it
    ignore_contents = {
        'de':(u'{{[Ii]nuse}}',
              u'{{[Ll]öschen}}',
            ),
        'en':(u'{{[Ii]nuse}}', 
              u'{{[Nn]ewpage}}', 
              u'{{[Uu]nderconstruction}}', 
            ),
        'fi':(u'{{[Tt]yöstetään}}',
            ),
        'kk':(u'{{[Ii]nuse}}',
              u'{{[Pp]rocessing}}',
            ),
        'nl':(u'{{wiu2}}',
              u'{{nuweg}}',
            ),
        'ru':(u'{{[Ii]nuse}}',
              u'{{[Pp]rocessing}}',
            ),
    }

    # Initialization stuff
    def __init__(self, shutdown):
        self.shutdown = shutdown
        dabs.load()
        # compile regular expressions
        self.ignore_contents_regexes = []
        self.site = wikipedia.getSite()
        if self.site.lang in self.ignore_contents:
            for ig in self.ignore_contents[self.site.lang]:
                self.ignore_contents_regexes.append(re.compile(ig))

        self.amb_template = wikipedia.translate(self.site, ambiguous_template)
        self.amb_regex = re.compile(wikipedia.translate(self.site, ambiguous_template_regex))

        self.ambiguous_tagged_log = wikipedia.config.datafilepath('disambiguations',
            'ambiguous-tagged-%s-%s.log' % (self.site.family.name, self.site.lang))
        self.ambiguous_skipped_log = wikipedia.config.datafilepath('disambiguations',
            'ambiguous-skipped-%s-%s.log' % (self.site.family.name, self.site.lang))
        
        threading.Thread.__init__(self)

    def logline(self, log_filename, logtext):
        try:
            f = codecs.open(log_filename, 'a+', 'utf-8')
            try:
                f.write(logtext)
            finally:
                f.close()
        except IOError:
            # a failure to write the log is not fatal
            return

    def checkContents(self, text):
        '''
        For a given text, return None if none of the regular expressions
        in the ignore_contents dictionary at the top of this class matches
        a substring of the text; otherwise return the substring matched by
        one of those regular expressions.
        '''
        for ig in self.ignore_contents_regexes:
            match = ig.search(text)
            if match:
                return match.group()
        return None
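
    # For example, on the English Wikipedia (where ignore_contents includes
    # u'{{[Ii]nuse}}'):
    #   checkContents(u'{{Inuse}} Draft text...')  -> u'{{Inuse}}'
    #   checkContents(u'Plain article text')       -> None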

    def noteAmbiguousLinks(self, page, dab_titles):
        global put_count

        # turn the set of titles into strings for the template and the edit summary
        titles_list = ''
        titles_bulleted = '<br />'
        dab_links = '[['
        any_title_contains_comma = False
        if dab_titles:
            for title in dab_titles:
                if ',' in title:
                    any_title_contains_comma = True
                titles_list += title + ', '
                titles_bulleted += '\n*' + title
                dab_links += title + ']],[['
            # trim the trailing ',[[' so the list of links ends with ']]'
            dab_links = dab_links[:-3]
            # in the template, use a bulleted list if any article title contains a comma
            if any_title_contains_comma:
                template_titles = titles_bulleted
            else:
                # trim the trailing ', '
                template_titles = titles_list[:-2]
            summary = wikipedia.translate(wikipedia.getSite(), summary_msg) % dab_links
        else:
            template_titles = ''
            summary = wikipedia.translate(wikipedia.getSite(), summary_all_gone_msg)
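
        # Worked example (illustrative titles): for dab_titles containing
        # u'Mercury' and u'Java' (set iteration order is arbitrary), this gives
        #   titles_list     = u'Mercury, Java, '  -> template_titles u'Mercury, Java'
        #   dab_links       = u'[[Mercury]],[[Java]]'
        #   summary         = u'Found ambiguous links to [[Mercury]],[[Java]]'
        # and the talk-page template becomes
        #   u'{{User:WildBot/msg|1=Mercury, Java}}'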

        try:
            self.content = page.get()
            ignoreReason = self.checkContents(self.content)
            if ignoreReason:
                # TODO: add retry
                wikipedia.output('\n\nSkipping %s because it contains %s.\n\n' % (page.title(), ignoreReason))
                return
        except wikipedia.IsRedirectPage:
            wikipedia.output(u'Already redirected, skipping.')
            return
        except wikipedia.NoPage:
            wikipedia.output(u'Already deleted')
            return

        # what template text are we inserting?
        if template_titles != '':
            replace_template = (self.amb_template % template_titles)
        else:
            # no ambiguous links left, so remove the template
            replace_template = ''

        # load the talk page, munge it
        talkpage = page.toggleTalkPage()
        # make a backup of the original text so we can show the changes later
        oldtalk = unicode()
        try:
            oldtalk = talkpage.get(get_redirect=True)
            text = oldtalk
            # locate the existing template
            template_match = self.amb_regex.search(text)
            if template_match:
                # we know where to update the template
                template_start = template_match.start(0)
                template_end = template_match.end(0)
                if (replace_template == '' and template_end < len(text)
                        and text[template_end] == '\n'):
                    # we're removing the template and it's at the end of a line,
                    # so remove that newline character as well
                    template_end += 1
            else:
                if replace_template != '':
                    # we didn't find the template, so add it to the top of the page
                    template_start = 0
                    template_end = 0
                    replace_template += '\n'
                else:
                    # we were going to remove it, but it's not there
                    return
            text = text[:template_start] + replace_template + text[template_end:]
        except wikipedia.NoPage:
            text= replace_template

        if text == oldtalk:
            wikipedia.output(u'No changes have been made.')
        else:
            if len(oldtalk) > 0:
                wikipedia.output(u'The following changes have been made to %s\n' % talkpage.permalink())
            else:
                wikipedia.output(u'The following changes have been made to %s\n' % talkpage.aslink())
            wikipedia.showDiff(oldtalk, text)
            # save the page
            try:
                logtext = page.title() + u'|' + summary + '\n'
                put_count += 1
                if put_count <= put_limit:
                    # for statistics-gathering purposes, initially only work on
                    # about half of the candidate articles
                    if len(page.title()) % 2 == 0:
                        talkpage.put_async(text, comment=summary, watchArticle=True, minorEdit=False)
                        wikipedia.output(u'Page saved')
                        self.logline(self.ambiguous_tagged_log, logtext)
                    else:
                        wikipedia.output(u'Page skipped for sampling purposes')
                        self.logline(self.ambiguous_skipped_log, logtext)
                else:
                    wikipedia.output(u'Run limit reached')
                    self.logline(self.ambiguous_tagged_log, logtext)
            except wikipedia.LockedPage:
                # TODO: add retry?
                wikipedia.output(u'Page not saved: page is locked')
            except wikipedia.PageNotSaved, error:
                wikipedia.output(u'Page not saved: %s' % error.args)


class TalkCleaner( MsgLeaver ):
    # Initialization stuff
    def __init__(self, shutdown, queue):
        self.queue = queue
        MsgLeaver.__init__(self, shutdown)
        self.lasttime = watchlist_monitor.LastWatchlistCheck(self.site)

    def run(self):
        try:
            while not self.shutdown.isSet():
                page = self.queue.remove_page()
                try:
                    titles = dabs.ambiguous_titles_on_page(page)
                    if titles:
                        wikipedia.output(u'Ambiguous links remain on ' + page.title())
                    else:
                        wikipedia.output(u'No ambiguous links left on ' + page.title())
                    # This test is only necessary because of a bug in editTime()    
                    if not dabs.is_disambiguation_like(page.title()):
                        pagetime = page.editTime()
                        self.noteAmbiguousLinks(page, titles)
                        self.lasttime.put(pagetime) 
                    self.shutdown.wait(1)
                except wikipedia.NoPage:
                    wikipedia.output(u'seems already gone')
        except:
            self.shutdown.set()
            self.queue.add_page(None)
            wikipedia.stopme()
            raise


class NewPageChecker( MsgLeaver ):
    # Initialization stuff
    def __init__(self, shutdown, queue):
        self.queue = queue
        # MsgLeaver.__init__ sets self.site, so call it before building the log path
        MsgLeaver.__init__(self, shutdown)
        self.unambiguous_log = wikipedia.config.datafilepath('disambiguations',
            'unambiguous-skipped-%s-%s.log' % (self.site.family.name, self.site.lang))

    def run(self):
        try:
            while not self.shutdown.isSet():
                page = self.queue.remove_page()
                try:
                    titles = dabs.ambiguous_titles_on_page(page)
                    if titles:
                        wikipedia.output(u'New page ' + page.title() + u' has ambiguous links...')
                        self.noteAmbiguousLinks(page, titles)
                        wikipedia.output(u'----- Current time: %s' % datetime.datetime.now())
                    else:
                        self.logline(self.unambiguous_log, page.title() + u'|\n')
                    self.shutdown.wait(1)
                except wikipedia.NoPage:
                    wikipedia.output(u'seems already gone')
        except:
            self.shutdown.set()
            self.queue.add_page(None)
            wikipedia.stopme()
            raise