User talk:Dc987test


This is a bot development test account created by User:Dc987. Please leave a message on this page if you have any questions regarding this account. Installation (a quick toolchain check is sketched after the list):

  • pywikipedia (trunk) [1]
  • sudo apt-get install python-scipy
  • sudo apt-get install python-numpy
  • tre-0.7.5.tar.bz2 [2]
  • crm114-20100106-BlameMichelson.src.tar.gz [3]
  • crm.py (renamed to crm114.py) [4]
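
A quick toolchain check, assuming the pieces above are installed. This is only a sketch, not part of the bot: it assumes crm.py has been renamed to crm114.py and exposes the Classifier class with learn/classify exactly as used in r.py below, and that the "data" category directory exists (create it first if crm.py does not create it itself).

#!/usr/bin/python
# -*- coding: utf-8  -*-
# Toolchain smoke test (sketch, not part of the bot).
import numpy
from scipy import stats
import wikipedia, xmlreader   # pywikipedia must be on PYTHONPATH and configured
import crm114                 # crm.py by Sam Deane, renamed to crm114.py

print("numpy %s" % numpy.__version__)
print(stats.describe([15, 47, 51, 99, 86]))      # scipy sanity check

c = crm114.Classifier("data", ["good", "bad"])   # category files live under ./data
c.learn("good", "ELFcomment fixing a typo")
c.learn("bad", "ELFnoComment +CLICK +HERE +FOR +FREE +RINGTONES")
print(c.classify("ELFnoComment +CLICK +HERE"))   # expect ("bad", probability) once trained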


Current code: r.py

#!/usr/bin/python
# -*- coding: utf-8  -*-

"""
This bot goes over multiple revisions and uses a Bayesian approach to detect spam/vandalism.

Current code:
    In the page history: detects reverts, reverted edits, and revert wars.
    Calculates token 'lifetime' statistics, which can be used to differentiate between ham/spam tokens/edits.
    Calculates page diffs.
    Uses CRM114 (text categorization engine: http://crm114.sourceforge.net) to detect bad/good edits.



These command line parameters can be used to specify which pages to work on:
Example: ./r.py -xml:path/Wikipedia-2010031201*.xml

&params;
    -xml           Retrieve information from one or more local XML dumps (pages-articles
                   or pages-meta-current, see http://download.wikimedia.org).
                   Argument can be given as "-xml:filename_pattern".

All other parameters will be regarded as part of the title of a single page,
and the bot will only work on that single page.
"""

__version__='$Id: r.py 7909 2010-02-05 06:42:52Z Dc987 $'

import re, sys, time, calendar, difflib, string, math, hashlib, os, fnmatch
from collections import defaultdict

# pywikipedia (trunk 2010/03/15) must be on your PYTHONPATH, configured and running
import wikipedia, pagegenerators, xmlreader, editarticle

# apt-get install python-numpy python-scipy
import numpy as np
from scipy import stats

# CRM114, crm.py module by Sam Deane
import crm114   

# known good, known bad revisions
import k as known

# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {
    '&params;':     pagegenerators.parameterHelp,
}

# Summary messages in different languages
# NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes'
# in fixes.py.
msg = {
    'ar':'.....: ..... ..... .....',
    'cs':'Robot odstranil odkaz na název článku',
    'de':'Bot: Entferne Selbstlinks',
    'en':'Robot: Removing selflinks',
    'es':'Bot: Eliminando enlaces al mismo artículo',
    'fr':'Robot: Enlève autoliens',
    'he':'...: .... ....... .. ... .....',
    'hu':'Bot: Önmagukra mutató hivatkozások eltávolítása',
    'ja':'....... ........',
    'ksh':'Bot: Ene Lengk vun de Sigg op sesch sellver, erus jenumme.',
    'nl':'Bot: verwijzingen naar pagina zelf verwijderd',
    'nn':'robot: fjerna sjølvlenkjer',
    'no':'robot: fjerner selvlenker',
    'pl':'Robot automatycznie usuwa linki zwrotne',
    'pt':'Bot: Retirando link para o próprio artigo',
    'ru':'...: ...... .........-...... . ....... ... ',
    'zh':'...:......',
}


def locate(pattern):
    '''Locate all files matching supplied filename pattern in and below
    supplied root directory.'''
    (root, pattern) = os.path.split(pattern)
    if not root: root = os.curdir
    for path, dirs, files in os.walk(os.path.abspath(root)):
        for filename in fnmatch.filter(files, pattern):
            yield os.path.join(path, filename)



def timestamp_to_time(timestamp):
    '''Wikipedia format timestamp to unix time'''
    year = int(timestamp[0:4])
    month = int(timestamp[5:7])
    day = int(timestamp[8:10])
    hour = int(timestamp[11:13])
    min = int(timestamp[14:16])
    sec = int(timestamp[17:19])
    return calendar.timegm((year, month, day, hour, min, sec))



# Generates token 'lifetime' statistics: how long a token has generally managed to survive in the page history.
def analyse_tokens_lifetime(xmlFilenames):
    # stats.describe([1,2,3]): N, (min, max), mean, variance, (skewness, coefficient of excess kurtosis)
    # print(stats.describe([15, 47, 51, 99, 86, 86, 86, 86, 86, 22, 22, 22, 22, 22, 22, 51, 51, 51, 51, 51, 51, 54, 54, 54, 54, 54, 54, 55, 55, 55, 55, 55, 55, 11, 11, 11, 11, 11, 11, 431, 431, 431, 431]))
    
    data = defaultdict(list)
    prev = None
    for xmlFilename in xmlFilenames:
        dump = xmlreader.XmlDump(xmlFilename, allrevisions=True)
        revisions = dump.parse()

        for e in revisions:
            #wikipedia.output("Page Revision: %s (%s) %s\n" % (e.timestamp, timestamp_to_time(e.timestamp), e.comment))
            if prev:
                dTime = timestamp_to_time(e.timestamp) - timestamp_to_time(prev.timestamp);
                tokens = prev.text.split()
                for token in tokens:
                    if(len(token) > 40): token = token[:40]  # truncate very long tokens
                    data[token].append(dTime)
            prev = e

    results = {} 
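    # results[token] = (mean lifetime in seconds, standard deviation, number of observations)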
    for token, v in data.iteritems():
        ldescr = stats.describe(v)
        RSD = math.sqrt(ldescr[3])
        if np.isnan(RSD): RSD = 0
        results[token] = (ldescr[2], RSD, ldescr[0])

    # for token, v in results.iteritems():
    #   wikipedia.output("Token: %s %s" % (token, v))
        
    sorted_results = sorted(results.items(), key=lambda t: t[1][0])
    for v in sorted_results:
        print("[%d] %d +/- %d sec. : %s" % (v[1][2], v[1][0], v[1][1], v[0].encode('utf-8')))
        # wikipedia.output("[%d] %d +/- %d sec. : %s" % (v[1][2], v[1][0], v[1][1], v[0]))



def dump_cstats(cstats_bad, cstats_good):
    wikipedia.output("===================================================================================")
    wikipedia.output("Classification stats bad: %s" % cstats_bad)
    wikipedia.output("Classification stats good: %s" % cstats_good)
    wikipedia.output("Training bad: %d, good: %d" % (cstats_bad[5], cstats_good[5]))

    wikipedia.output("Vandalism detected: %d" % cstats_bad[2])
    wikipedia.output("Vandalism undetected: %d" % cstats_good[2])
    wikipedia.output("Good edits detected: %d" % (cstats_good[0] + cstats_good[1]))
    wikipedia.output("False positives (regular): \03{lightpurple}%d\03{default}" % cstats_bad[1])
    wikipedia.output("False positives (reverts): %d" % cstats_bad[0])
    wikipedia.output("===================================================================================")


# -------------------------------------------------------------------------
# returns: rev_info
# -1  : regular revision
# -2 : between duplicates, by single user (reverted, most likely bad)
# -3 : between duplicates, by other users (reverted, questionable)
# -4 : between duplicates, (revert that was reverted. revert war.)
# >=0: this revision is a duplicate of
# -------------------------------------------------------------------------
def analyse_reverts(xmlFilenames):
    rev_hashes = defaultdict(list)
    rev_usernames = []
    total_revisions = 0
    for xmlFilename in xmlFilenames:
        dump = xmlreader.XmlDump(xmlFilename, allrevisions=True)
        revisions = dump.parse() 

        # calculate page text hashes and duplicates lists 
        for e in revisions:
            rev_usernames.append(e.username)
            if(e.text):
                m = hashlib.md5()
                m.update(e.text.encode('utf-8'))
                rev_hashes[m.digest()].append(total_revisions)  # total_revisions is just an index really
            total_revisions += 1
            
    # Marking (-1) : regular revision
    rev_info = [-1] * total_revisions

    # Marking (-2, -4, >=0)
    # -2 : between duplicates, by single user (reverted, most likely bad)
    # -4 : between duplicates, (revert that was reverted. revert war.)        
    # >=0: this revision is a duplicate of
    # ------------------------------------------------------------------
    # Revision 54 (-1)      User0    Regular edit
    # Revision 55 (55)      User1    Regular edit
    # Revision 56 (-2)      User2    Vandalism
    # Revision 57 (-2)      User2    Vandalism
    # Revision 58 (-2)      User3    Correcting vandalism, but not quite
    # Revision 59 (55)      User4    Revert to Revision 55
    for m, indexes in rev_hashes.iteritems():
        if len(indexes) > 1:
            for i in indexes:
                rev_info[i] = indexes[0]

    reverted_to = -1
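    # Walk the history backwards. A revision whose hash duplicates an earlier
    # revision j is treated as a revert back to j; every revision between j and
    # that revert lies in a reverted span: plain revisions become -2, and
    # duplicates of some other revision become -4 (a revert that was itself
    # reverted, i.e. a revert war). Reaching j itself closes the span.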
    for i in reversed(xrange(total_revisions)):
        if(reverted_to != -1):            
            if(rev_info[i] == -1): rev_info[i] = -2
            elif(rev_info[i] != reverted_to):
                wikipedia.output("Revert war: revision %d is a duplicate of %d was later reverted to %d" % (i, rev_info[i], reverted_to)) 
                rev_info[i] = -4
        elif(rev_info[i] >= 0): reverted_to = rev_info[i]  
        if(i == reverted_to): reverted_to = -1   
    
    # Marking (-3) : between duplicates, by other users (reverted, questionable)
    # Revision 54 (-1)  ->   (-1)                User0    Regular edit
    # Revision 55 (55)  ->   (55)                User1    Regular edit
    # Revision 56 (-2)  ->   (-2)                User2    Vandalism
    # Revision 57 (-2)  ->   (-2)                User2    Vandalism
    # Revision 58 (-2)  ->   (-3)                User3    Correcting vandalism, but not quite
    # Revision 59 (55)  ->   (55)                User4    Revert to Revision 55
    username = None
    for i in xrange(total_revisions):
        if(rev_info[i] == -2): 
            if(username == None): username = rev_usernames[i]
            elif (username != rev_usernames[i]): rev_info[i] = -3
        else: username = None

    return rev_info

        

def analyse_crm114(xmlFilenames, rev_info):
    # CRM114
    c = crm114.Classifier( "data", [ "good", "bad" ] )
    cstats_good = [0] * 6
    cstats_bad = [0] * 6
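    # cstats_bad/cstats_good count classifications by the edit's history status:
    # [0] duplicate / revert target, [1] regular or known-good, [2] reverted by a
    # single user or known-bad, [3] reverted span touched by other users,
    # [4] revert war, [5] number of edits fed back to CRM114 for training.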
    false_positives_ids = []
    i = 0

    for xmlFilename in xmlFilenames:
        #for i in reverts:
        #    wikipedia.output("Revision %d (%d)" % (i[0], i[1]));
        dump = xmlreader.XmlDump(xmlFilename, allrevisions=True)
        revisions = dump.parse()
        prev = None
        for e in revisions:
            #wikipedia.output("Revision %d (%d): %s by %s Comment: %s" % (i, rev_info[i], e.timestamp, e.username, e.comment))
            if prev:
                diff_time = timestamp_to_time(e.timestamp) - timestamp_to_time(prev.timestamp);               
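                # Build the pseudo-document fed to CRM114: ELF* feature tokens
                # (edit rate, username, IP/restriction flags, comment presence)
                # followed by the word-level diff, with added words prefixed '+'
                # and removed words prefixed '-'.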
                edit = []
                if(diff_time < 60): edit.append('ELFsuperfast')
                elif(diff_time < 3600): edit.append('ELFfast')
                elif(diff_time < 3600*24): edit.append('ELFregular')
                else: edit.append('ELFstable')
                # edit.append('ELFtitle')       
                # edit.append(e.title)
                # edit.append('ELFusername')       
                # edit.append(e.username)
                edit.append('ELFusername:' + e.username)
                if(e.ipedit): edit.append("ELFipedit")
                if(e.editRestriction): edit.append("ELFeditRest")
                if(e.moveRestriction): edit.append("ELFmoveRest")
                #edit.append(e.redirect)
                if(e.comment):
                    edit.append('ELFcomment')
                    edit.append(e.comment)
                else: edit.append('ELFnoComment')
                
                if(e.text and prev.text):                    
                    diff = difflib.ndiff(prev.text.split(), e.text.split())
                    for delta in diff:
                        if   delta[:1] == '+': edit.append('+' + delta[2:])
                        elif delta[:1] == '-': edit.append('-' + delta[2:])
                        else: continue
                elif(e.text and not prev.text): edit.append('ELFrevblank')
                elif(prev.text and not e.text): edit.append('ELFblanking')
                else: edit.append('ELFblank')
                
                edit_text = ' '.join(edit)
                #wikipedia.output(edit_text);
                
                (classification, probability) = c.classify(edit_text.encode('utf-8'))

                if(classification == "bad" and rev_info[i] == -1):
                    if(e.revisionid not in known.good  and  e.revisionid not in known.bad):
                        wikipedia.output("Class was: %s, prob was:%f" % (classification, probability))
                        wikipedia.output("Revision %d (%d): %s by %s : %s : %s" % (i, rev_info[i], e.timestamp, e.username, e.revisionid, e.comment))
                        wikipedia.output("Class was: %s, prob was:%f" % (classification, probability))
                        wikipedia.output(">>> \03{lightpurple}*************  %s   *************\03{default} <<<" % e.comment)
                        wikipedia.showDiff(prev.text, e.text)
                        wikipedia.output("%s\n" % edit_text)
                        wikipedia.output("------------------------------------------------------")
                        false_positives_ids.append(e.revisionid)
                #if(classification == "good" and rev_info[i] < -1):
                #    wikipedia.output(">>> \03{lightpurple}%s\03{default} <<<" % e.comment)
                #    wikipedia.showDiff(prev.text, e.text)

                if(True):
                    # correcting cstat (to get valid performance metrics)
                    cstat = -rev_info[i]
                    if(e.revisionid in known.good): cstat = 1
                    if(e.revisionid in known.bad): cstat = 2

                    if(classification == "bad"): cstats_bad[max(0, cstat)] += 1;
                    if(classification == "good"): cstats_good[max(0, cstat)] += 1;

                if(True):                  # testing
                    if(rev_info[i] == -2 and (probability < 0.75 or classification == "good")):
                        c.learn("bad", edit_text.encode('utf-8'))
                        cstats_bad[5] += 1
                        
                    if(rev_info[i] >= -1 and (probability < 0.75 or classification == "bad")):
                        c.learn("good", edit_text.encode('utf-8'))
                        cstats_good[5] += 1
                 
                    # wikipedia.output("------------------------------------------------------")
                if(i % 100 == 0):
                    dump_cstats(cstats_bad, cstats_good)

            prev = e
            i += 1
        
    dump_cstats(cstats_bad, cstats_good)
    wikipedia.output("False positives ids: %s" % false_positives_ids)


def main():
    pattern = None
    for arg in wikipedia.handleArgs():
        if arg.startswith('-xml'):
            if len(arg) == 4:
                pattern = wikipedia.input('Please enter the XML dump\'s filename:')
            else:
                pattern = arg[5:]
            
    if(not pattern):
        wikipedia.output('Usage: ./r.py -xml:path/Wikipedia-Single-Page-Dump-*.xml')
        return

    xmlFilenames = sorted(locate(pattern))
    wikipedia.output(u"Files: \n%s\n\n" % xmlFilenames)
    mysite = wikipedia.getSite()

    # analyse_tokens_lifetime(xmlFilenames)
    rev_info = analyse_reverts(xmlFilenames)
    analyse_crm114(xmlFilenames, rev_info)


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()

Current code: k.py

# human verified 'good' revisions
good = ['270611313', '270782479', '279151059', '280880991', '283613884', '285516379', '285630939', '293173712', '293175959', '293283520', '298988140', '298989737', '298990094', '298990345', '298991025', '298992760', '298997996', '299004571', '299007204', '299019755', '299039475', '299041077', '299041737', '299043896', '299044925', '299178468', '299179512', '299181283', '299353088', '299376198', '299705276', '300072854', '301436474', '301448102', '301448305', '301452651', '301459870', '301460250', '301461058', '301466921', '301468381', '301469529', '301472382', '301473104', '301477837', '301480731', '302671228', '302676739', '302681436', '302686381', '302688078', '302788317', '303043897', '303182630', '303252143', '306110964', '308765897', '309347208', '314932032', '315082475', '318266199', '320604051', '323531231', '323999361', '324074563', '325132996', '332670505', '337363031', '341398662', '342099052', '342608588', '344241920', '344439801', '350285598', '350288772', '350294646', '267161919', '303254113', '308751495', '344951361', '350417615']

# human verified 'bad' revisions
bad = ['299163964', '316627070', '329061902', '268571925', '323407491']
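
The false-positive revision ids printed at the end of analyse_crm114 can be reviewed by hand and folded back into k.py. Below is a minimal sketch of such a helper; it is hypothetical and not part of the bot, and the reviewed_good/reviewed_bad lists stand in for whatever was verified manually.

#!/usr/bin/python
# Sketch: merge manually reviewed revision ids into k.py (hypothetical helper).
import k as known

reviewed_good = []   # ids confirmed by hand as good edits
reviewed_bad = []    # ids confirmed by hand as vandalism

good = sorted(set(known.good) | set(reviewed_good))
bad = sorted(set(known.bad) | set(reviewed_bad))

out = open('k.py', 'w')
out.write("# human verified 'good' revisions\ngood = %r\n\n" % good)
out.write("# human verified 'bad' revisions\nbad = %r\n" % bad)
out.close()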