User:Sminthopsis84/temp

Python 3 changes required:

dateutil must be installed and imported (as before, but note that pip has become pip3)
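
A minimal check that the library is available, using a timestamp in the format found in the dumps (the value here is made up):
 import dateutil.parser

 # stub-meta-history timestamps are ISO 8601 strings like this one
 thetime=dateutil.parser.parse("2004-07-19T01:42:35Z")
 print (thetime.year)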

		Syntax changes required
Replace
 import urllib
 import urllib2
with
 import urllib.request
 import urllib.error
(though urllib.error isn't used in Visviva's code)
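
Although unused, urllib.error is where the exceptions raised by urllib.request live; a sketch of how it could replace the bare "except Exception" in the retry loop (a suggestion, not part of Visviva's code):
 import urllib.request
 import urllib.error

 try:
     urllib.request.urlopen("http://dumps.wikimedia.your.org/enwiki/latest/",timeout=240)
 except urllib.error.URLError as e: # HTTPError is a subclass, so both are caught
     print ("Download failed:",e.reason)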

Change urllib calls, e.g.,
 request=urllib2.Request
becomes
 request=urllib.request.Request

Replace all print statements with function calls, e.g.,
 print str(u),"already done"
becomes
 print (str(u),"already done")

Exception handling requires the keyword "as":
 except Exception, e:
      print (str(e))
becomes
 except Exception as e:
      print (str(e))

Variable names that shadow built-in functions must be changed (these are built-ins rather than reserved words, but reusing them hides the originals); in the code below they become sorted1 and max1:
 sorted
 max
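
A minimal illustration, with made-up values, of why the renaming matters:
 sorted1=[(5,"UserA"),(2,"UserB")] # renamed, as in sortusers() below
 print (sorted(sorted1)) # the built-in sorted() is still usable
 sorted=sorted1 # shadowing the built-in instead...
 # sorted(sorted) # ...would now raise TypeError: 'list' object is not callable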

		Run-time errors
Byte strings and Unicode strings are now distinct types that cannot be mixed: everything read from urlopen() is bytes, so literals that interact with it must become byte literals (see the sketch after these examples).
 urlpaths=re.findall('"[^"]+-stub-meta-history\d.*?\.xml\.gz"',dumpspage)
becomes
 urlpaths=re.findall(b'"[^"]+-stub-meta-history\d.*?\.xml\.gz"',dumpspage)

 self.urls=[self.dumpsurl+x.replace('"','') for x in urlpaths]
becomes
 self.urls=[bytes(self.dumpsurl,'utf-8')+x.replace(b'"',b'') for x in urlpaths]

 print ("Downloading "+u)
becomes
 print ("Downloading ", u)

urllib.request.urlretrieve requires a string for the url, so the byte-string URLs built above have to be decoded before they can be downloaded.
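A hedged sketch of one way to do that, assuming the URLs are UTF-8-encoded bytes after the change above (the URL shown is only an illustration):
 import urllib.request

 # u normally comes from self.urls, which holds bytes
 u=b"http://dumps.wikimedia.your.org/enwiki/latest/enwiki-latest-stub-meta-history1.xml.gz"
 filepath="stubhist_working.xml.gz"
 if isinstance(u,bytes):
     u=u.decode('utf-8') # urlretrieve needs a str URL
 urllib.request.urlretrieve(u,filepath)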
<more to come>

------------------ Visviva's code as converted by me ------------------
import gzip
import os
import re
import time
import urllib.request
import urllib.error

from sys import stdout
from collections import defaultdict

class Downloader:
    def __init__(self):
        self.dumpsurl="http://dumps.wikimedia.your.org/enwiki/latest/"
        self.headers={'User-agent' : 'JumpingSpider/0.0'}
        self.counters=[]
        self.trackers=[]
        self.matchups={}
        self.replaced_users=set()

    def process(self): # get URLs of all pre-combination stub-meta-history files
        request=urllib.request.Request(self.dumpsurl,headers=self.headers)
        dumpspage=urllib.request.urlopen(request,timeout=240).read()
        # dumpspage is bytes under Python 3, so the pattern and the URL prefix must be bytes too
        urlpaths=re.findall(b'"[^"]+-stub-meta-history\d.*?\.xml\.gz"',dumpspage)
        self.urls=[bytes(self.dumpsurl,'utf-8')+x.replace(b'"',b'') for x in urlpaths]
        
    def go(self): # to download, process, and delete the segmented stub-meta-history files in sequence
        doneurls=[x[0] for x in self.counters]
        for u in self.urls:
            if u in doneurls:
                print (u,"already done")
                continue
            filepath="stubhist_working.xml.gz"
            print ("Downloading "+u)
            done=False
            while not done:
                try:
                    # urlretrieve needs a str URL; self.urls holds bytes after the conversion above
                    urllib.request.urlretrieve(u.decode('utf-8') if isinstance(u,bytes) else u, filepath)
                    done=True
                except Exception as e:
                    print (str(e))
                    time.sleep(10)
            print ("Reading....")
            gfile=gzip.GzipFile(filepath)
            with gfile:
                self.counters.append((u,self.countusers(gfile))) # avoid dict of dicts, too slippery
            print ()
            print ("Deleting ....")
            os.unlink(filepath)

    def run(self, filepaths): # to just use already-downloaded DB files
        for f in filepaths:
            print (f)
            self.countem=self.countusers(f)
            open("wikicount_dump.txt","w").write(self.dump())
        
    def dump(self):
        output=""
        for c in self.counters:
            path=c[0]
            dixie=c[1]
            for d in dixie.keys():
                newline=path+"\t"+str(d)+"\t"+str(dixie[d])+"\n"
                output+=newline
        return output
        
    def countusers(self,path): 
        import dateutil.parser
        if hasattr(path,'read'): # go() passes an already-open GzipFile rather than a path
            file=path
        elif path.endswith(".gz"):
            file=gzip.GzipFile(path)
        else:
            file=open(path)
        i=0
        users= defaultdict(int)
        tracker=defaultdict(set)
        reading=False
        reading_rev=False
        try:
            for line in file:
                i+=1 
                if isinstance(line,bytes): # gzip and binary-mode files yield bytes under Python 3
                    line=line.decode('utf-8')
                line=line.strip()
                if line.startswith("<page"):
                    reading=True
                    revisions=[]
                    reading_rev=False
                    thetitle=""
                    continue
                if reading is not True:
                    continue
                else:
                    if line.startswith("</page>"):
                        sortedrevs=list(revisions)
                        sortedrevs.sort()
                        username=sortedrevs[0][1]
                        if username != revisions[0][1]:
                            self.replaced_users.add((thetitle,username,revisions[0][1]))
                        users[username]+=1
                        tracker[username].add(thetitle)
                        self.matchups[thetitle]=username
                        reading=False
                        reading_rev=False
                        stdout.write("\r") #put progress counter here to minimize waste
                        stdout.flush()
                        stdout.write(str(i))
                        continue
                    elif reading_rev is True:
                        if line.startswith("<timestamp>"):
                            timestamp=line.split(">")[1].split("<")[0]
                            continue
                        elif line.startswith("<ip") or line.startswith("<username"): 
                            if not timestamp: # just in case
                                print ("No timestamp!",thetitle)
                            else:
                                thetime=dateutil.parser.parse(timestamp)
                                if line.startswith("<ip>"): #need to avoid counting pages created by IP for the first registered user to edit
                                    username="IP:"+line.split(">")[1].split("<")[0].strip()
                                elif line.startswith("<username />"):
                                    username=""
                                elif line.startswith("<username"):
                                    username=line.split(">")[1].split("<")[0].strip()
                                revisions.append((thetime,username))
                            reading_rev=False
                    elif line.startswith("<revision"):
                        reading_rev=True
                        timestamp=""
                        username=""
                        continue
                    elif line.startswith("<title>"):
                        thetitle=line.split(">")[1].split("<")[0].strip()
                        continue
                    elif line.startswith("<ns>"):
                        if not line.startswith("<ns>0<"):
                            reading=False
                            continue
                    elif line.startswith("<redirect"):
                        reading=False
                        continue
        except Exception as e:
            print (str(e))
        for u in users.keys():
            if len(tracker[u]) != users[u]:
                print ("Discrepancy:",u,str(len(tracker[u])),str(users[u]))
        self.counters.append((path,users))
        self.trackers.append((path,tracker))
        return users

        
        
def sortusers(users):
    sorted1=[]
    for u in users.keys():
        sorted1.append((users[u],u))
    sorted1.sort()
    sorted1.reverse()
    return sorted1
    
def summate(counters):
    output={}
    for o in counters:
        print (o[0],len(output),sum(output.values()))
        for k in o[1].keys():
            if k in output.keys():
                output[k]+=o[1][k]
            else:
                output[k]=o[1][k]
    return output

def summate2(counters):
    output=defaultdict(int)
    for o in counters:
        print (o[0],len(output),sum(output.values()))
        for k in o[1].keys():
            output[k]+=o[1][k]
    return output

def truncate(summation,max1=10000):
    userlist=[]
    for s in summation.keys():
        userlist.append((summation[s],s))
    print (len(userlist))
    userlist.sort()
    userlist.reverse()
    userlist=userlist[:max1]
    return userlist
    
def get_current_totals():
    output=[]
    pagename="Wikipedia:List_of_Wikipedians_by_article_count/Data"
    url="http://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&rvprop=content&format=xml" % pagename
    page=urllib.request.urlopen(url,timeout=60).read().decode('utf-8') # decode: read() returns bytes
    page=page.split("<rev ")[1].split(">",1)[1].split("<")[0]
    pieces=page.split("|}")[0].split("|-")[2:]
    pieces=[x.strip() for x in pieces]
    for p in pieces:
        data=[x.strip() for x in p.split("|") if x.strip()]
        if not data: 
            continue
        rank=int(data[0])
        username=data[1]
        count=int(data[2].replace(",",""))
        output.append(tuple([rank,username,count]))
    return output
    
def get_mismatches(current,summation):
    mismatched=[] # list of tuples: (discrepancy,username,current,new)
    currentdict=dict([(x[1],x[2]) for x in current])
    for c in currentdict.keys():
        if c in summation.keys():
            if int(summation[c]) != int(currentdict[c]):
                diff=int(summation[c])-int(currentdict[c])
                mismatched.append((diff,c,currentdict[c],summation[c]))
    mismatched.sort()
    mismatched.reverse()
    return mismatched
    
def getanons():
    pagename="Wikipedia:List of Wikipedians by number of edits/Anonymous".replace(" ","_")
    url="http://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&rvprop=content&format=xml" % pagename
    anonpage=urllib.request.urlopen(url,timeout=60).read().decode('utf-8') # decode: read() returns bytes
    anonpage=anonpage.split("==\n",1)[1]
    anons=[x.split("]]")[0] for x in anonpage.split("[[User:")[1:]]
    print (str(len(anons))+" anons")
    return anons
    
def replaceanons(wikitext,anons=[]):
    if not anons:
        anons=getanons()
    for anon in anons:
        catchme="| %s\n" % anon
        if catchme in wikitext:
            print ("Effacing "+anon)
            wikitext=wikitext.replace(catchme, "| [Placeholder]\n")
    return wikitext

def dumpusers(foo,userlist=[]): # Downloader object
    outdict=defaultdict(set)
    for tracker in foo.trackers:
        path=tracker[0]
        for user in tracker[1].keys():
            outdict[user] |= tracker[1][user]
    outtext=""
    for user in outdict.keys():
        newline=user+"\t"
        newline+="[["
        newline+="]] - [[".join(outdict[user])
        newline+="]]\n"
        outtext+=newline
    return outtext
    
def makedatapage(userlist): #as returned by truncate()
    text="""{| class="wikitable sortable"
|- style="white-space:nowrap;"
! No.
! User
! Article count
|-"""
    for u in userlist:
        number=str(userlist.index(u)+1)
        count=str(u[0])
        newlines="""
| %s
| %s
| %s
|-""" % (number,u[1],count)
        text += newlines
    text += "\n|}"
    return text
    
    
def totalprep(foo): # take completed Downloader and make Data page
    summation=summate2(foo.counters)
    truncation=truncate(summation,5000)
    datapage=makedatapage(truncation)
    datapage=replaceanons(datapage)
    return datapage