User talk:Dc987test
This is a bot development test account created by User:Dc987. Please leave a message on this page if you have any questions regarding this account.
Installation:
- pywikipedia (trunk) [1]
- sudo apt-get install python-scipy
- sudo apt-get install python-numpy
- tre-0.7.5.tar.bz2 [2]
- crm114-20100106-BlameMichelson.src.tar.gz [3]
- crm.py (renamed to crm114.py) [4]
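A quick way to sanity-check the setup (a minimal sketch; it assumes the packages above are installed, pywikipedia is on your PYTHONPATH and configured, and crm.py has been renamed to crm114.py as noted):
import numpy, scipy.stats       # python-numpy, python-scipy
import wikipedia, xmlreader     # pywikipedia (trunk)
import crm114                   # crm.py module, renamed
print "dependencies OK"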
Current code: r.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot goes over multiple revisions and tries a Bayesian approach to detect spam/vandalism.
Current code:
- In the history: detects reverts, reverted edits, and revert wars.
- Calculates token 'lifetime' statistics, which can be used to differentiate between ham/spam tokens/edits.
- Calculates page diffs.
- Uses CRM114 (a text categorization engine: http://crm114.sourceforge.net) to detect bad/good edits.
These command line parameters can be used to specify which pages to work on:
Example: ./r.py -xml:path/Wikipedia-2010031201*.xml
&params;
-xml Retrieve information from a local XML dump(s) (pages-articles
or pages-meta-current, see http://download.wikimedia.org).
Argument can be given as "-xml:filename_pattern".
All other parameters will be regarded as part of the title of a single page,
and the bot will only work on that single page.
"""
__version__='$Id: r.py 7909 2010-02-05 06:42:52Z Dc987 $'
import re, sys, time, calendar, difflib, string, math, hashlib, os, fnmatch
from collections import defaultdict
# pywikipedia (trunk 2010/03/15) in your PYTHONPATH, configured and running
import wikipedia, pagegenerators, xmlreader, editarticle
# sudo apt-get install python-numpy python-scipy
import numpy as np
from scipy import stats
# CRM114, crm.py module by Sam Deane
import crm114
# known good, known bad revisions
import k as known
# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp,
}
# Summary messages in different languages
# NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes'
# in fixes.py.
msg = {
    'ar':'روبوت: إزالة وصلات ذاتية',
    'cs':'Robot odstranil odkaz na název článku',
    'de':'Bot: Entferne Selbstlinks',
    'en':'Robot: Removing selflinks',
    'es':'Bot: Eliminando enlaces al mismo artículo',
    'fr':'Robot: Enlève autoliens',
    'he':'בוט: מסיר קישורים של הדף לעצמו',
    'hu':'Bot: Önmagukra mutató hivatkozások eltávolítása',
    'ja':'ロボットによる 自己リンクの解除',
    'ksh':'Bot: Ene Lengk vun de Sigg op sesch sellver, erus jenumme.',
    'nl':'Bot: verwijzingen naar pagina zelf verwijderd',
    'nn':'robot: fjerna sjølvlenkjer',
    'no':'robot: fjerner selvlenker',
    'pl':'Robot automatycznie usuwa linki zwrotne',
    'pt':'Bot: Retirando link para o próprio artigo',
    'ru':'робот: убирает ссылки страницы на саму себя',
    'zh':'機器人:移除自我連結',
}
def locate(pattern):
'''Locate all files matching supplied filename pattern in and below
supplied root directory.'''
(root, pattern) = os.path.split(pattern)
if not root: root = os.curdir
for path, dirs, files in os.walk(os.path.abspath(root)):
for filename in fnmatch.filter(files, pattern):
yield os.path.join(path, filename)
def timestamp_to_time(timestamp):
    '''Convert a Wikipedia-format timestamp to Unix time.'''
    year = int(timestamp[0:4])
    month = int(timestamp[5:7])
    day = int(timestamp[8:10])
    hour = int(timestamp[11:13])
    minute = int(timestamp[14:16])  # 'minute' rather than 'min', to avoid shadowing the builtin
    sec = int(timestamp[17:19])
    return calendar.timegm((year, month, day, hour, minute, sec))
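    # Example (illustrative): timestamp_to_time('2010-03-12T04:05:06Z') == 1268366706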
# Generates token 'lifetime' statistics: how long a token has generally managed to stay in the page history.
def analyse_tokens_lifetime(xmlFilenames):
    # stats.describe([1,2,3]): N, (min, max), mean, variance, (skewness, coefficient of excess kurtosis)
# print(stats.describe([15, 47, 51, 99, 86, 86, 86, 86, 86, 22, 22, 22, 22, 22, 22, 51, 51, 51, 51, 51, 51, 54, 54, 54, 54, 54, 54, 55, 55, 55, 55, 55, 55, 11, 11, 11, 11, 11, 11, 431, 431, 431, 431]))
data = defaultdict(list)
prev = None
for xmlFilename in xmlFilenames:
dump = xmlreader.XmlDump(xmlFilename, allrevisions=True)
revisions = dump.parse()
for e in revisions:
#wikipedia.output("Page Revision: %s (%s) %s\n" % (e.timestamp, timestamp_to_time(e.timestamp), e.comment))
if prev:
dTime = timestamp_to_time(e.timestamp) - timestamp_to_time(prev.timestamp);
tokens = prev.text.split()
for token in tokens:
                    if(len(token) > 40): token = token[:40]  # truncate overly long tokens to 40 chars
data[token].append(dTime)
prev = e
results = {}
for token, v in data.iteritems():
        ldescr = stats.describe(v)
        sd = math.sqrt(ldescr[3])  # ldescr[3] is the variance, so this is the standard deviation
        if np.isnan(sd): sd = 0
        results[token] = (ldescr[2], sd, ldescr[0])  # (mean lifetime, standard deviation, sample count)
# for token, v in results.iteritems():
# wikipedia.output("Token: %s %s" % (token, v))
sorted_results = sorted(results.items(), key=lambda t: t[1][0])
for v in sorted_results:
print("[%d] %d +/- %d sec. : %s" % (v[1][2], v[1][0], v[1][1], v[0].encode('utf-8')))
# wikipedia.output("[%d] %d +/- %d sec. : %s" % (v[1][2], v[1][0], v[1][1], v[0]))
def dump_cstats(cstats_bad, cstats_good):
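    # Buckets, indexed by the corrected cstat computed in analyse_crm114():
    #   [0] duplicates/reverts, [1] regular edits, [2] reverted single-user edits,
    #   [3] reverted multi-user edits, [4] revert-war edits, [5] edits trained on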
wikipedia.output("===================================================================================")
wikipedia.output("Classification stats bad: %s" % cstats_bad)
wikipedia.output("Classification stats good: %s" % cstats_good)
wikipedia.output("Training bad: %d, good: %d" % (cstats_bad[5], cstats_good[5]))
wikipedia.output("Vandalism detected: %d" % cstats_bad[2])
wikipedia.output("Vandalism undetected: %d" % cstats_good[2])
wikipedia.output("Good edits detected: %d" % (cstats_good[0] + cstats_good[1]))
wikipedia.output("False positives (regular): \03{lightpurple}%d\03{default}" % cstats_bad[1])
wikipedia.output("False positives (reverts): %d" % cstats_bad[0])
wikipedia.output("===================================================================================")
# -------------------------------------------------------------------------
# returns: rev_info
# -1 : regular revision
# -2 : between duplicates, by single user (reverted, most likely bad)
# -3 : between duplicates, by other users (reverted, questionable)
# -4 : between duplicates, (revert that was reverted. revert war.)
# >=0: this revision is a duplicate of
# -------------------------------------------------------------------------
def analyse_reverts(xmlFilenames):
rev_hashes = defaultdict(list)
rev_usernames = []
total_revisions = 0
for xmlFilename in xmlFilenames:
dump = xmlreader.XmlDump(xmlFilename, allrevisions=True)
revisions = dump.parse()
# calculate page text hashes and duplicates lists
for e in revisions:
rev_usernames.append(e.username)
if(e.text):
m = hashlib.md5()
m.update(e.text.encode('utf-8'))
rev_hashes[m.digest()].append(total_revisions) # total_revisions is just an index really
total_revisions += 1
# Marking (-1) : regular revision
rev_info = [-1] * total_revisions
# Marking (-2, -4, >=0)
# -2 : between duplicates, by single user (reverted, most likely bad)
# -4 : between duplicates, (revert that was reverted. revert war.)
# >=0: this revision is a duplicate of
# ------------------------------------------------------------------
    # Revision 54 (-1) User0 Regular edit
    # Revision 55 (55) User1 Regular edit
    # Revision 56 (-2) User2 Vandalism
    # Revision 57 (-2) User2 Vandalism
    # Revision 58 (-2) User3 Correcting the vandalism, but not quite
    # Revision 59 (55) User4 Revert to Revision 55
for m, indexes in rev_hashes.iteritems():
if len(indexes) > 1:
for i in indexes:
rev_info[i] = indexes[0]
reverted_to = -1
for i in reversed(xrange(total_revisions)):
if(reverted_to != -1):
if(rev_info[i] == -1): rev_info[i] = -2
elif(rev_info[i] != reverted_to):
wikipedia.output("Revert war: revision %d is a duplicate of %d was later reverted to %d" % (i, rev_info[i], reverted_to))
rev_info[i] = -4
elif(rev_info[i] >= 0): reverted_to = rev_info[i]
if(i == reverted_to): reverted_to = -1
# Marking (-3) : between duplicates, by other users (reverted, questionable)
    # Revision 54 (-1) -> (-1) User0 Regular edit
    # Revision 55 (55) -> (55) User1 Regular edit
    # Revision 56 (-2) -> (-2) User2 Vandalism
    # Revision 57 (-2) -> (-2) User2 Vandalism
    # Revision 58 (-2) -> (-3) User3 Correcting the vandalism, but not quite
    # Revision 59 (55) -> (55) User4 Revert to Revision 55
username = None
for i in xrange(total_revisions):
if(rev_info[i] == -2):
if(username == None): username = rev_usernames[i]
elif (username != rev_usernames[i]): rev_info[i] = -3
else: username = None
return rev_info
def analyse_crm114(xmlFilenames, rev_info):
# CRM114
c = crm114.Classifier( "data", [ "good", "bad" ] )
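    # Two-category classifier; CRM114 keeps its per-category statistics files
    # under the local 'data' directory. Below, classify() returns a
    # (category, probability) pair and learn() trains a category on the text.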
cstats_good = [0] * 6
cstats_bad = [0] * 6
false_positives_ids = []
i = 0
for xmlFilename in xmlFilenames:
#for i in reverts:
# wikipedia.output("Revision %d (%d)" % (i[0], i[1]));
dump = xmlreader.XmlDump(xmlFilename, allrevisions=True)
revisions = dump.parse()
prev = None
for e in revisions:
#wikipedia.output("Revision %d (%d): %s by %s Comment: %s" % (i, rev_info[i], e.timestamp, e.username, e.comment))
if prev:
diff_time = timestamp_to_time(e.timestamp) - timestamp_to_time(prev.timestamp);
edit = []
if(diff_time < 60): edit.append('ELFsuperfast')
elif(diff_time < 3600): edit.append('ELFfast')
elif(diff_time < 3600*24): edit.append('ELFregular')
else: edit.append('ELFstable')
# edit.append('ELFtitle')
# edit.append(e.title)
# edit.append('ELFusername')
# edit.append(e.username)
edit.append('ELFusername:' + e.username)
if(e.ipedit): edit.append("ELFipedit")
if(e.editRestriction): edit.append("ELFeditRest")
if(e.moveRestriction): edit.append("ELFmoveRest")
#edit.append(e.redirect)
if(e.comment):
edit.append('ELFcomment')
edit.append(e.comment)
else: edit.append('ELFnoComment')
if(e.text and prev.text):
diff = difflib.ndiff(prev.text.split(), e.text.split())
for delta in diff:
if delta[:1] == '+': edit.append('+' + delta[2:])
elif delta[:1] == '-': edit.append('-' + delta[2:])
else: continue
elif(e.text and not prev.text): edit.append('ELFrevblank')
elif(prev.text and not e.text): edit.append('ELFblanking')
else: edit.append('ELFblank')
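                # e.g. diffing 'a quick fox' -> 'a slow fox' above adds the
                # features '-quick' and '+slow' to the edit token list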
edit_text = ' '.join(edit)
#wikipedia.output(edit_text);
(classification, probability) = c.classify(edit_text.encode('utf-8'))
if(classification == "bad" and rev_info[i] == -1):
if(e.revisionid not in known.good and e.revisionid not in known.bad):
wikipedia.output("Class was: %s, prob was:%f" % (classification, probability))
wikipedia.output("Revision %d (%d): %s by %s : %s : %s" % (i, rev_info[i], e.timestamp, e.username, e.revisionid, e.comment))
wikipedia.output("Class was: %s, prob was:%f" % (classification, probability))
wikipedia.output(">>> \03{lightpurple}************* %s *************\03{default} <<<" % e.comment)
wikipedia.showDiff(prev.text, e.text)
wikipedia.output("%s\n" % edit_text)
wikipedia.output("------------------------------------------------------")
false_positives_ids.append(e.revisionid)
#if(classification == "good" and rev_info[i] < -1):
# wikipedia.output(">>> \03{lightpurple}%s\03{default} <<<" % e.comment)
# wikipedia.showDiff(prev.text, e.text)
if(True):
# correcting cstat (to get valid performance metrics)
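                    # cstat: 1 = regular (-1), 2 = reverted single-user (-2),
                    #        3 = reverted multi-user (-3), 4 = revert war (-4);
                    #        duplicates (rev_info >= 0) fall into bucket 0 via max()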
cstat = -rev_info[i]
if(e.revisionid in known.good): cstat = 1
if(e.revisionid in known.bad): cstat = 2
if(classification == "bad"): cstats_bad[max(0, cstat)] += 1;
if(classification == "good"): cstats_good[max(0, cstat)] += 1;
if(True): # testing
if(rev_info[i] == -2 and (probability < 0.75 or classification == "good")):
c.learn("bad", edit_text.encode('utf-8'))
cstats_bad[5] += 1
if(rev_info[i] >= -1 and (probability < 0.75 or classification == "bad")):
c.learn("good", edit_text.encode('utf-8'))
cstats_good[5] += 1
# wikipedia.output("------------------------------------------------------")
if(i % 100 == 0):
dump_cstats(cstats_bad, cstats_good)
prev = e
i += 1
dump_cstats(cstats_bad, cstats_good)
wikipedia.output("False positives ids: %s" % false_positives_ids)
def main():
pattern = None
for arg in wikipedia.handleArgs():
if arg.startswith('-xml'):
if len(arg) == 4:
pattern = wikipedia.input('Please enter the XML dump\'s filename:')
else:
pattern = arg[5:]
if(not pattern):
wikipedia.output('Usage: ./r.py -xml:path/Wikipedia-Single-Page-Dump-*.xml')
return
xmlFilenames = sorted(locate(pattern))
wikipedia.output(u"Files: \n%s\n\n" % xmlFilenames)
mysite = wikipedia.getSite()
# analyse_tokens_lifetime(xmlFilenames)
rev_info = analyse_reverts(xmlFilenames)
analyse_crm114(xmlFilenames, rev_info)
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()
Current code: k.py
# human verified 'good' revisions
good = ['270611313', '270782479', '279151059', '280880991', '283613884', '285516379', '285630939', '293173712', '293175959', '293283520', '298988140', '298989737', '298990094', '298990345', '298991025', '298992760', '298997996', '299004571', '299007204', '299019755', '299039475', '299041077', '299041737', '299043896', '299044925', '299178468', '299179512', '299181283', '299353088', '299376198', '299705276', '300072854', '301436474', '301448102', '301448305', '301452651', '301459870', '301460250', '301461058', '301466921', '301468381', '301469529', '301472382', '301473104', '301477837', '301480731', '302671228', '302676739', '302681436', '302686381', '302688078', '302788317', '303043897', '303182630', '303252143', '306110964', '308765897', '309347208', '314932032', '315082475', '318266199', '320604051', '323531231', '323999361', '324074563', '325132996', '332670505', '337363031', '341398662', '342099052', '342608588', '344241920', '344439801', '350285598', '350288772', '350294646', '267161919', '303254113', '308751495', '344951361', '350417615']
# human verified 'bad' revisions
bad = ['299163964', '316627070', '329061902', '268571925', '323407491']