User:DumZiBoT/reflinks.py
Revision as of 02:15, 5 February 2008

Source of the User:DumZiBoT/refLinks task.

Please edit this page if you think that my code needs improvements: the fact that I released this code on a wiki page is not meaningless.

It would be nice if you poked me when you plan to reuse my code, but again, it's up to you.

# -*- coding: utf-8 -*-
"""
This bot searches for references which consist only of a bare link
without a title (i.e. <ref>[http://www.google.fr/]</ref> or
<ref>http://www.google.fr/</ref>), fetches the HTML title of the
linked page, and uses it as the title of the wiki link in the
reference, i.e.
<ref>[http://www.google.fr/search?q=test test - Google Search]</ref>

Every 20 edits, the bot checks its talk page and a special stop page: if
either of these pages has been edited, it stops.

&params;

-limit:n                Stops after n edits

-xml:dump.xml           Use a local XML dump instead of fetching each page
                        from the wiki; preferable for performance and
                        server-load reasons

Basic pagegenerators commands, -page, etc...
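
Example invocation (an illustrative sketch; the flags are documented above
and the page/file names are placeholders):

    python reflinks.py -xml:dump.xml -namespace:0 -limit:50
    python reflinks.py -page:Example_article -always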
"""
# (C) 2008 - Nicolas Dumazet ( en:User:NicDumZ )
#
# Distributed under the terms of the GPL

from BeautifulSoup import UnicodeDammit
import sys, re, urllib2, httplib, socket, codecs
import wikipedia, pagegenerators

msg = { 'fr':u'Bot: Correction des refs. mal formatées (cf. [[Utilisateur:DumZiBoT/liensRefs|explications]])',
        'de':u'Bot: Korrektes Referenzformat (siehe [[:en:User:DumZiBoT/refLinks]])',
        'en':u'Bot: Converting [[User:DumZiBoT/refLinks|bare references]]'}

comment = {'fr':u'Titre généré automatiquement',
           'de':u'Automatisch generierter Titel',
           'en':u'Bot generated title'}

stopPage = {'fr':u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper',
            'de':u'Benutzer:DumZiBoT/EditThisPageToStopMe',
            'en':u'User:DumZiBoT/EditThisPageToStopMe'}
deadLinkTag = {'fr':u'',
               'de':u'',
               'en':u'{{dead link}}'}

soft404   = re.compile(ur'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog', re.IGNORECASE)
dirIndex  = re.compile(ur'^\w+://[^/]+/((default|index)\.(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$', re.IGNORECASE)
domain    = re.compile(ur'^(\w+)://(?:www\.|)([^/]+)')
badtitles = {'en':
                # starts with
                ur'(?is)^\W*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on|untitled)'
                # anywhere
                +ur'|(404|page|file).*not *found'
                # should never be
                +ur'|^(JSTOR: Accessing JSTOR)$'
                # ends with
                +ur'|(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on)\W*$'}
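# A few illustrative titles that the 'en' blacklist above rejects
# (examples only, not taken from the original source):
#   "Sign in", "Untitled Document", "404 - Page Not Found", "Please register"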

linksInRef = re.compile(
    # bracketed URLs
    ur'<ref>\s*\[?(?P<url>(?:http|https|ftp)://(?:' +
    # unbracketed with()
    ur'[^\[\]\s<>"]+\([^\[\]\s<>"]+[^\[\]\s\.:;\\,<>\?"]+|'+
    # unbracketed without ()
    ur'[^\[\]\s<>"]+[^\[\]\s\)\.:;\\,<>\?"]+|[^\[\]\s<>"]+))[!?,\s]*\]?\s*</ref>(?!\s*([Dd]ead[ _]*link|[Dd]l|404|[Bb]roken[ _]+link))')
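
# Illustrative matches for linksInRef (examples only):
#   <ref>[http://www.example.com/page]</ref>    -> url group: http://www.example.com/page
#   <ref>http://www.example.com/a_(b)</ref>     -> url group: http://www.example.com/a_(b)
# The negative lookahead at the end skips references that are already
# followed by a "dead link", "dl", "404" or "broken link" marker.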


class XmlDumpPageGenerator:
    def __init__(self, xmlFilename, xmlStart, namespaces):
        self.xmlFilename = xmlFilename
        self.xmlStart = xmlStart
        self.namespaces = namespaces
        self.skipping = bool(xmlStart)
        self.site = wikipedia.getSite()

        import xmlreader
        dump = xmlreader.XmlDump(self.xmlFilename)
        self.parser = dump.parse()

    def __iter__(self):
        return self
    
    def next(self):
        while True:
            try:
                entry = self.parser.next()
            except StopIteration:
                raise
            if self.skipping:
                if entry.title != self.xmlStart:
                    continue
                self.skipping = False
            page = wikipedia.Page(self.site, entry.title)
            if self.namespaces and page.namespace() not in self.namespaces:
                continue
            if linksInRef.search(entry.text):
                return page

class ReferencesRobot:
    def __init__(self, generator, acceptall = False, limit = None):
        self.generator = generator
        self.acceptall = acceptall
        self.limit = limit
        self.site = wikipedia.getSite()
        self.stopPage = wikipedia.translate(self.site, stopPage)
        self.stopPageRevId = wikipedia.Page(self.site, 
                                            self.stopPage).latestRevision()
        self.META_CONTENT = re.compile(ur'(?i)<meta[^>]*content\-type[^>]*>')
        self.CHARSET = re.compile(ur'(?i)charset\s*=\s*(?P<enc>[^;>/]*)')
        self.TITLE = re.compile(ur'(?is)(?<=<title>).*?(?=</title>)')
        self.NON_HTML = re.compile(ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')
        allowed_media_types = ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml'
        self.MIME = re.compile(allowed_media_types)
        self.HTML_STRIP = re.compile(ur'(?is)</?\w+( [^</>]*=.*?)?>')
        self.titleBlackList = re.compile(wikipedia.translate(self.site, badtitles))
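        # e.g. (illustrative) a header such as
        #   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
        # is caught by META_CONTENT, and CHARSET then extracts the charset
        # value ('utf-8' once quotes and spaces are stripped).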
    
    def put_page(self, page, new):
        """
        Prints a diff between the original and the new text, and puts the new text for the page
        """
        wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" 
                         % page.title())
        wikipedia.showDiff(page.get(), new)
        if not self.acceptall:
            choice = wikipedia.inputChoice(u'Do you want to accept ' +
                                           u'these changes?', 
                                           ['Yes', 'No', 'All'], 
                                           ['y', 'N', 'a'], 'N')
            if choice in ['a', 'A']:
                self.acceptall = True
            if choice in ['y', 'Y']:
                page.put_async(new)
        if self.acceptall:
            try:
                page.put(new)
            except wikipedia.EditConflict:
                wikipedia.output(u'Skipping %s because of edit conflict' 
                                  % (page.title(),))
            except wikipedia.SpamfilterError, e:
                wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url))
            except wikipedia.PageNotSaved, error:
                wikipedia.output(u'Error putting page: %s' % (error.args,))
            except wikipedia.LockedPage:
                wikipedia.output(u'Skipping %s (locked page)' 
                                  % (page.title(),))
            except wikipedia.ServerError, e:
                wikipedia.output(u'Server Error : %s' % e)

    def refTitle(self, link, title):
        comm = wikipedia.translate(self.site, comment)
        return '<ref>[%s %s<!-- %s -->]</ref>' % (link, title, comm )
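        # e.g. on the English Wikipedia (example values):
        #   refTitle('http://www.example.com', 'Example Domain') returns
        #   '<ref>[http://www.example.com Example Domain<!-- Bot generated title -->]</ref>'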

    def refLink(self, link):
        return '<ref>%s</ref>' % link
	
    def refDead(self, link):
        tag = wikipedia.translate(self.site, deadLinkTag)
        return '<ref>[%s]%s</ref>' % (link, tag)

    def httpError(self, err_num, link, pagetitleaslink):
        """Log HTTP Error"""
        wikipedia.output(u'HTTP error (%s) for %s on %s' 
                          % (err_num, link, pagetitleaslink),
                         toStdout = True)
        f = codecs.open(
                wikipedia.config.datafilepath(
                    'reflinks-httpErrorLog', 
                    'reflinks-%s-%s.txt' % (self.site.family.name, 
                                            self.site.lang)),
                'a', 'utf-8')
        f.write(u'%s: %s from %s\n' % (err_num, link, pagetitleaslink))
        f.close()

    def avoid_uppercase(self, title):
        """
        If title has more than 6 characters and more than 75% of its
        letters are uppercase, title() it (capitalize each word)
        """
        if len(title) <= 6:
            return title
        nb_upper = 0
        nb_letter = 0
        for letter in title:
            if letter.isupper():
                nb_upper += 1
            if letter.isalpha():
                nb_letter += 1
            if letter.isdigit():
                return title
        if nb_letter and (nb_upper * 100) / nb_letter > 75:
            return title.title()
        else:
            return title
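
    # e.g. (illustrative): avoid_uppercase(u'BREAKING NEWS FROM THE SITE')
    # returns u'Breaking News From The Site', while u'NASA' (6 characters or
    # fewer) and u'Page 404' (contains a digit) are returned unchanged.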

    def run(self):
        """
        Runs the Bot
        """
        wikipedia.setAction(wikipedia.translate(self.site, msg))
        editedpages = 0
        for page in self.generator:
            try:
                # Load the page's text from the wiki
                new_text = page.get()
                if not page.canBeEdited():
                    wikipedia.output(u"You can't edit page %s" 
                                      % page.aslink())
                    continue
            except wikipedia.NoPage:
                wikipedia.output(u'Page %s not found' % page.aslink())
                continue
            except wikipedia.IsRedirectPage:
                wikipedia.output(u'Page %s is a redirect' % page.aslink())
                continue

            # Common fixes for URLs
            new_text = re.sub(r'(http:?/+)+', 'http://', new_text)  # Silently correct http://http:/
            new_text = re.sub(r"(\[\w+://[^][<>\"\s]*?)''", r"\1 ''", new_text) # HTML pre-convert markup
            #new_text = re.sub(r'(\[http[s]?://[^][<>\s/]+)([ \]])', r'\1/\2', new_text) # adds / to the end of domains
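            # e.g. (illustrative) the two active substitutions above turn
            # 'http://http://www.example.com' into 'http://www.example.com' and
            # insert a space before '' in "[http://www.example.com''Foo'']".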

            # for each link to change
            for match in linksInRef.finditer(wikipedia.removeDisabledParts(new_text)):
                link = match.group('url')
                #debugging purpose
                #print link
                try:

                    socket.setdefaulttimeout(10)
                    url = re.sub(u'#.*', '', link)
                    f = urllib2.urlopen(url)
                    #Try to get Content-Type from server
                    contentType = f.info().getheader('Content-Type')
                    if contentType and not self.MIME.search(contentType):
                        wikipedia.output(u'\03{lightyellow}WARNING\03{default} : media : %s ' % link)
                        repl = self.refLink(link)
                        new_text = new_text.replace(match.group(), repl)
                        continue
                    # Test if the redirect was valid
                    redir = f.geturl()
                    if redir != link and domain.findall(redir) == domain.findall(link):
                        if soft404.search(redir) and not soft404.search(link):
                            wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect 404 : %s ' % link)
                            continue
                        if dirIndex.match(redir) and not dirIndex.match(link):
                            wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect to root : %s ' % link)
                            continue

                    # Read the first 1,000,000 bytes (0.95 MB)
                    linkedpagetext = f.read(1000000)
                    socket.setdefaulttimeout(None)

                except UnicodeError:
                    #example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html in [[fr:Cyanure]]
                    wikipedia.output(u'\03{lightred}Bad link\03{default} : %s in %s' % (url, page.aslink()))
                    continue
                except urllib2.HTTPError, e:
                    self.httpError(e.code, url, page.aslink())
                    if e.code == 410: # 410 Gone, indicates that the resource has been purposely removed
                        repl = self.refDead(link)
                        new_text = new_text.replace(match.group(), repl)
                    continue
                except (urllib2.URLError, socket.timeout, httplib.error), e:
                    wikipedia.output(u'Can\'t get page %s : %s' % (url, e))
                    continue
                except ValueError:
                    #Known bug of httplib, google for :
                    #"httplib raises ValueError reading chunked content"
                    continue
                
                #remove <script>/<style>/comments/CDATA tags
                linkedpagetext = self.NON_HTML.sub('', linkedpagetext)
                meta_content = self.META_CONTENT.search(linkedpagetext)
                enc = None
                if meta_content:
                    tag = meta_content.group()
                    if not contentType: 
                        contentType = tag
                    s = self.CHARSET.search(tag)
                    if s: 
                        enc = s.group('enc').strip("\"' ").lower()
                if not contentType:
                    wikipedia.output(u'No content-type found for %s' % link)
                    continue
                elif not self.MIME.search(contentType):
                    wikipedia.output(u'\03{lightyellow}WARNING\03{default} : media : %s ' % link)
                    continue


                u = UnicodeDammit(linkedpagetext, overrideEncodings = [enc])
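                # Note: u.unicode is None when UnicodeDammit (BeautifulSoup)
                # could not decode the page text, even with the charset
                # suggested by the meta tag above as a hint.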
                titre = None
                if not u.unicode:
                    #Some pages have both utf-8 AND windows-1252 characters,
                    #which can't easily be parsed. (~1 in 1000)
                    wikipedia.output(u'%s : Hybrid encoding...' % link)
                    continue

                for m in self.TITLE.finditer(u.unicode):
                    t = m.group()
                    if t:
                        #convert html entities
                        t = wikipedia.html2unicode(t)
                        t = re.sub(r'-+', '-', t)
                        #remove formatting, i.e long useless strings
                        t = re.sub(r'[\.+\-=]{4,}', ' ', t)
                        #remove \n and \r and Unicode spaces from titles
                        t = re.sub(r'(?u)\s', ' ', t)
                        #t = re.sub(r'[\n\r\t]', ' ', t)
                        #remove extra whitespaces
                        #remove leading and trailing ./;/,/-/_/+/ /
                        t = re.sub(r' +', ' ', t.strip('=.;,-+_ '))
                        if t:
                            titre = t
                            break;

                # Check whether the document contains at least 3 times as many
                # printed characters as the title
                # Issues: framesets, image galleries, 
                #doctext = self.HTML_STRIP.sub(' ', linkedpagetext)
                #doctext = re.sub('(?u)\s+', ' ', doctext)
                #if len(doctext) < len(titre) * 3:
                #    wikipedia.output(u'%s : Page is too short' % link)
                #    continue


                if not titre:
                    repl = self.refLink(link)
                    new_text = new_text.replace(match.group(), repl)
                    wikipedia.output(u'%s : No title found...' % link)
                    continue
                # u'Ã©' is the utf-8 encoding of 'é' misread as latin-1, a sign
                # that the title was decoded with the wrong encoding
                if u'Ã©' in titre:
                    repl = self.refLink(link)
                    new_text = new_text.replace(match.group(), repl)
                    wikipedia.output(u'%s : Hybrid encoding...' % link)
                    continue
                if self.titleBlackList.search(titre):
                    repl = self.refLink(link)
                    new_text = new_text.replace(match.group(), repl)
                    wikipedia.output(u'%s : Blacklisted title (%s)' % (link, titre))
                    continue

                titre = self.avoid_uppercase(titre)
                #avoid closing the link before the end
                titre = titre.replace(']', '&#93;')
                #avoid multiple } being interpreted as a template inclusion
                titre = titre.replace('}}', '}&#125;')
                #prevent multiple quotes being interpreted as '' or '''
                titre = titre.replace('\'\'', '\'&#39;')
                titre = wikipedia.unicode2html(titre, self.site.encoding())

                repl = self.refTitle(link,titre)
                new_text = new_text.replace(match.group(), repl)

            if new_text == page.get():
                wikipedia.output('No changes were necessary in %s' 
                                 % page.aslink())
                continue

            editedpages += 1
            self.put_page(page, new_text)
            if self.limit and editedpages >= self.limit:
                wikipedia.output('Edited %s pages, stopping.' % self.limit)
                return
            if editedpages % 20 == 0:
                wikipedia.output('\03{lightgreen}Checking stop page...\03{default}')
                actualRev = wikipedia.Page(self.site, 
                                           self.stopPage).latestRevision()
                if actualRev != self.stopPageRevId:
                    wikipedia.output(u'%s has been edited : Someone wants us to stop.' % self.stopPage)
                    return
            if self.site.messages:
                wikipedia.output(u'Bot has new messages. Better stop to check.')
                return

def main():
    genFactory = pagegenerators.GeneratorFactory()
    
    PageTitles = []
    xmlFilename = None
    always = False
    limit = None
    namespaces = []
    generator = None
    for arg in wikipedia.handleArgs():
        if arg.startswith('-page:'):
            PageTitles.append(arg[6:])
        elif arg.startswith('-namespace:'):
            try:
                namespaces.append(int(arg[11:]))
            except ValueError:
                namespaces.append(arg[11:])
        elif arg.startswith('-summary:'):
            wikipedia.setAction(arg[9:])
        elif arg == '-always':
            always = True
        elif arg.startswith('-limit:'):
            limit = int(arg[7:])
        elif arg.startswith('-xmlstart'):
            if len(arg) == 9:
                xmlStart = wikipedia.input(
                    u'Please enter the dumped article to start with:')
            else:
                xmlStart = arg[10:]
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = wikipedia.input(
                    u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
        else:
            generator = genFactory.handleArg(arg)
    
    if xmlFilename:
        try:
            xmlStart
        except NameError:
            xmlStart = None
        generator = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces)
    elif PageTitles:
        pages = [wikipedia.Page(wikipedia.getSite(), PageTitle) for PageTitle in PageTitles]
        generator = iter(pages)
    if not generator:
        # syntax error, show help text from the top of this file
        wikipedia.showHelp('reflinks')
        wikipedia.stopme()
        sys.exit()
    generator = pagegenerators.PreloadingGenerator(generator, pageNumber = 50)
    generator = pagegenerators.RedirectFilterPageGenerator(generator)
    bot = ReferencesRobot(generator, always, limit)
    bot.run()

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()