User:Disambot/Source
Appearance
The Disambot source code is divided into three scripts:
- enwp.py provides the framework for interfacing with the English Wikipedia. It uses a combination of API calls and regular HTTP requests.
- disambot.py reads a list of disambiguation pages (more precisely, their titles) from working list.txt and passes each title to an inspection function, which loads the page content, applies a series of clean-up changes, and saves the page if anything was changed.
- private.py stores the username and password of the bot account.
These scripts are shown below:
enwp.py
import urllib, urllib2, ClientCookie, time

# When True, the functions in this module print progress messages.
debug_mode = False

# Root URL of the wiki; every request URL in this module is built from it.
base_url = 'http://en.wikipedia.org/'

# Endpoint that receives the API calls (login and token queries).
api_url = base_url + 'w/api.php'
def login(username, password):
    """POST the given credentials to the API's ``login`` action.

    The response body is read but never inspected, and nothing is returned.
    NOTE(review): presumably ClientCookie keeps the session cookies for the
    requests made afterwards -- confirm against the ClientCookie docs.
    """
    endpoint = api_url
    credentials = {
        'action' : 'login',
        'lgname' : username,
        'lgpassword' : password,
        'format' : 'xml'
    }

    if debug_mode:
        print('Logging in...')

    # The body is read in full and then discarded.
    ClientCookie.urlopen(endpoint, urllib.urlencode(credentials)).read()

    if debug_mode:
        print('Done')
def grab_page(title, render=False, expand_templates=False):
    """Fetch a page through index.php and return the response body.

    title            -- page title; spaces are replaced by underscores and
                        the result is percent-encoded before it goes into
                        the URL
    render           -- if true, request ``action=render``; otherwise
                        ``action=raw``
    expand_templates -- if true, add ``templates=expand`` to the request

    Returns the body of the HTTP response as a string.
    """
    if render:
        action = 'render'
    else:
        action = 'raw'

    # Percent-encode the title so that characters such as '&', '#', '?',
    # '+' or '%' cannot be misread as URL syntax.  ':' and '/' are left
    # literal, so ordinary titles yield the same URL as before.
    safe_title = urllib.quote(title.replace(' ', '_'), ':/')

    url = base_url + 'w/index.php?title=' + safe_title + '&action=' + action
    if expand_templates:
        url += '&templates=expand'

    if debug_mode:
        print('Fetching ' + url)
    response = ClientCookie.urlopen(url).read()
    if debug_mode:
        print(str(len(response)) + ' bytes received')
    return response
def _xml_attribute(xml, name):
    """Return the value of the first ``name="..."`` found in xml, or None."""
    marker = name + '="'
    start = xml.find(marker)
    if start == -1:
        return None
    start += len(marker)
    end = xml.find('"', start)
    if end == -1:
        return None
    return xml[start:end]


def edit_page(title, new_content, summary=''):
    """Replace the text of a page by submitting the index.php edit form.

    title       -- title of the page to edit
    new_content -- complete new text of the page
    summary     -- edit summary (default: empty)

    The edit token and the timestamp of the last revision are first
    requested from the API; the new text is then POSTed to
    ``index.php?action=submit``.

    Returns True when the save request went through without an HTTP error.
    Returns False, without attempting the save, when the API response
    contains no edit token or no revision timestamp, and False when the
    save request raises urllib2.HTTPError.
    """
    # First, obtain the required editing token and the timestamp of the last page edit
    query = {
        'action' : 'query',
        'prop' : 'info|revisions',
        'intoken' : 'edit',
        'titles' : title,
        'format' : 'xml'
    }
    if debug_mode:
        print('Fetching ' + api_url)
    response = ClientCookie.urlopen(api_url, urllib.urlencode(query)).read()
    if debug_mode:
        print(str(len(response)) + ' bytes received')

    # Pull the token and the last revision timestamp out of the XML response
    token = _xml_attribute(response, 'edittoken')
    ts = _xml_attribute(response, 'timestamp')
    if token is None or ts is None:
        # Without both values the form fields below would be filled with
        # garbage, so give up instead of submitting a bogus edit.
        if debug_mode:
            print('No edit token or revision timestamp in response; not saving')
        return False
    if debug_mode:
        print('Token: ' + token)
        print('Base timestamp: ' + ts)

    # We just fetched a (last edit) timestamp of the form 2008-06-18T07:18:06Z; convert it to 20080618071806
    edit_time = ts[0:4] + ts[5:7] + ts[8:10] + ts[11:13] + ts[14:16] + ts[17:19]
    if debug_mode:
        print('Time of last edit: ' + str(edit_time))

    # Get the current (UTC) time in the same 20080618071806 format
    start_time = time.strftime('%Y%m%d%H%M%S', time.gmtime())
    if debug_mode:
        print('Time of token retreival: ' + str(start_time))

    # Next, push the new page content.  This goes through the regular edit
    # form; an API-based request (action=edit) was sketched here earlier
    # but was disabled.
    url = base_url + 'w/index.php?' + urllib.urlencode({ 'title':title, 'action':'submit' }, True)
    form = {
        'wpAntispam' : '',
        'wpSection' : '',
        'wpStarttime' : start_time,
        'wpEdittime' : edit_time,
        'wpScrolltop' : 0, # purpose unclear; sent because the edit form has this field
        'wpTextbox1' : new_content,
        'wpSummary' : summary,
        'wpAutoSummary' : 'd41d8cd98f00b204e9800998ecf8427e', # MD5 hash of the empty string
        'wpSave' : 'Save page',
        'wpEditToken' : token
    }
    req = urllib2.Request(url, urllib.urlencode(form), { 'User-Agent' : 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9) Gecko/2008060309 Firefox/3.0' }, True)
    if debug_mode:
        print('Sending data to ' + url)

    succeeded = True
    try:
        response = ClientCookie.urlopen(req).read()
    except urllib2.HTTPError as http_error:
        if debug_mode:
            print('HTTP error encountered...')
        # Keep the error object so it can be examined in debug mode
        response = http_error
        succeeded = False
    except AttributeError:
        # Seems to be a small bug in ClientCookie; deliberately not treated
        # as a failure.  There is no save response to keep in this case.
        response = None

    if debug_mode:
        # Expose the last response as a module global for interactive inspection
        globals()['response'] = response
    return succeeded
def sandbox_test():
    """Overwrite Wikipedia:Sandbox with a fixed greeting through edit_page."""
    sandbox_title = 'Wikipedia:Sandbox'
    greeting = 'Hello! This is a sandbox edit done using a [[Python (programming language)|Python]] script.'
    edit_page(sandbox_title, greeting)
disambot.py
import enwp, private

# Abbreviations that may legitimately end a list item; inspect() checks the
# end of each item against this list before removing trailing punctuation.
abbreviations = ( 'ac.', 'Co.', 'Corp.', 'deg.', 'ft.', 'Inc.', 'kg.', 'km.', 'mi.', 'mo.', 'oz.', 'qr.', 'qt.', 'yd.' )

# Log in to en-wp account
enwp.login(private.username, private.password)
def _trim_item_punctuation(line, index):
    """Drop a trailing '.', ',' or ';' from a list item; return (line, error or None)."""
    if line[-1] not in ('.', ',', ';'):
        return line, None

    first_dot = line.find('.')
    if (line.count('.') >= 2 and line[first_dot + 1] == ' '
            and line[first_dot + 2] == line[first_dot + 2].upper()):
        # Looks like more than one sentence; leave it for a human
        return line, 'item with multiple sentences detected (line ' + str(index) + ')'

    # Keep the punctuation if the item ends with a known abbreviation
    lowered = line.lower()
    for abbreviation in abbreviations:
        if lowered.endswith(' ' + abbreviation.lower()):
            return line, None

    # An upper-case character before the punctuation suggests an acronym
    if line[-2] == line[-2].lower():
        return line[:-1], None
    return line, None


def _strip_outer_bold(content):
    """Remove bold markup opened right before a leading wikilink ('''[[X]]''' -> [[X]])."""
    if not content.startswith("'''"):
        return content
    unquoted = content.lstrip("'")
    # Exactly three apostrophes means bold only (five would be bold + italics)
    if len(content) - len(unquoted) == 3 and unquoted.startswith('[['):
        # Drop the opening ''' and the first ''' that follows it
        return unquoted.replace("'''", '', 1)
    return content


def _unpipe_leading_link(content):
    """Turn a leading [[Target|label]] into [[Target]] unless the label is italicised."""
    pipe = content.find('|')
    close = content.find(']]')
    if not content.startswith('[[') or pipe == -1 or close == -1 or pipe > close:
        return content
    italics = content.find("''", pipe, close)
    if italics != -1 and content[italics + 2] != "'":
        # Italics inside the pipe: handling is not implemented, leave as is
        return content
    return content[:pipe] + content[close:]


def _unlink_extra_links(content, raw_html, index):
    """Unlink every wikilink after the leading one; return (content, error or None)."""
    if content.find('[[', 3) == -1:
        # No link beyond the very start of the item
        return content, None

    first_open = content.find('[[')
    first_close = content.find(']]')
    if first_close == -1:
        target = content[first_open + 2:]
    else:
        # Only a pipe inside the first link separates its target from its label
        first_pipe = content.find('|', first_open, first_close)
        if first_pipe == -1:
            target = content[first_open + 2:first_close]
        else:
            target = content[first_open + 2:first_pipe]

    # The rendered page marks links to missing pages with this suffix
    is_red_link = raw_html.find(target + ' (page does not exist)') != -1

    if first_open not in (0, 2) or is_red_link:
        # There are inappropriate wikilinks, but if we remove them we'll be
        # left with no links. Human review needed.
        return content, 'item contains link, but not in the proper place (line ' + str(index) + ')'

    if first_close == -1 or first_close < first_open:
        # No proper closing "]]" ... something must be screwy
        return content, 'error in wikilink syntax (line ' + str(index) + ')'

    # The first word is wikilinked as it should be and is not a red link;
    # replace every later link with its visible text.
    scan_from = first_close + 2
    while True:
        link_start = content.find('[[', scan_from)
        link_end = content.find(']]', scan_from)
        if link_start == -1 or link_end == -1:
            break
        if link_start > link_end:
            return content, 'error in wikilink syntax (line ' + str(index) + ')'
        # Look for the pipe inside this link only
        link_pipe = content.find('|', link_start, link_end)
        if link_pipe == -1:
            text_start = link_start + 2
        else:
            text_start = link_pipe + 1
        content = content[:link_start] + content[text_start:link_end] + content[link_end + 2:]
    return content, None


def inspect(title):
    """Clean up one disambiguation page and save it if anything changed.

    title -- title of the page to inspect

    The wikitext is fetched and processed line by line: a leading '#' is
    turned into '*', and list items get their trailing punctuation, outer
    bold markup, leading piped link and additional wikilinks cleaned up.
    Problems that need human review (including external links) are
    collected and, if there are any, listed on
    'User:Disambot/Potential atrocities' unless the page is already listed
    there.

    Returns False when the page is skipped because it is a set index;
    otherwise returns None.
    """
    print('Inspecting ' + title + '...')

    article_body = enwp.grab_page(title).strip()

    # Skip set indices
    if '[[category:set indices' in article_body.lower():
        return False

    # The rendered HTML is only needed to recognise red links
    raw_html = enwp.grab_page(title, True)

    changed = False
    complex_errors = []
    lines = article_body.splitlines()

    # Main loop -- cycle through lines
    for i, original in enumerate(lines):
        # Skip short/empty lines
        if len(original) < 5:
            continue
        line = original.strip()
        if not line:
            continue
        line_orig = line

        # Replace ordered list items with unordered list items (a redirect
        # directive also starts with '#' and must not be touched)
        if line[0] == '#' and not line.upper().startswith('#REDIRECT'):
            line = '*' + line[1:]

        # Handle list items
        if line[0] == '*':
            # Fix punctuation at the end
            line, error = _trim_item_punctuation(line, i)
            if error is not None:
                complex_errors.append(error)

            # Remove any bullets to assess the item itself
            line_content = line
            while line_content.startswith('*'):
                line_content = line_content[1:].strip()
            line_content_orig = line_content

            # Remove outer boldness if necessary
            line_content = _strip_outer_bold(line_content)

            # Correct piped links
            line_content = _unpipe_leading_link(line_content)

            # Check for wikilinks that are not the first word
            line_content, error = _unlink_extra_links(line_content, raw_html, i)
            if error is not None:
                complex_errors.append(error)

            # Update the line without screwing with its spacing: keep the
            # bullets and the whitespace that preceded the item text
            line = line[:len(line) - len(line_content_orig)] + line_content

        # Replace old version of this line with new one if we've changed anything
        if line != line_orig:
            lines[i] = line
            changed = True

    # Implode lines back into one big string
    article_body = "\n".join(lines)

    # Check for external links
    links = article_body.count('[http')
    if links > 0:
        complex_errors.append('contains '+str(links)+' external link'+('s'*(links!=1)))

    # Finish up
    if changed:
        # Update the article
        print("\tMaking changes...")
        enwp.edit_page(title, article_body, 'Cleaning up disambiguation page in accordance with [[Wikipedia:Manual of Style (disambiguation pages)]]')

    if complex_errors:
        # Add the article to list of potential atrocities, along with notes, unless it's already there
        atrocities = enwp.grab_page('User:Disambot/Potential atrocities')
        if "[[" + title + "]]" not in atrocities: # if not already listed
            atrocities += "\n\n[[" + title + "]]"
            for this in complex_errors:
                atrocities += "\n* " + this
            print("\tListing on potential atrocities...")
            enwp.edit_page('User:Disambot/Potential atrocities', atrocities, 'Adding [['+title+']]')
def go():
    """Run inspect() on every title listed in the file 'working list'.

    The file is read one title per line; surrounding whitespace is removed
    and blank lines are skipped.  The file is closed even if an inspection
    raises.
    """
    article_list = open('working list', 'r')
    try:
        for title in article_list:
            title = title.strip()
            if title:
                inspect(title)
    finally:
        article_list.close()
private.py
# Credentials of the bot account; the real values are not shown here.
username = '(not shown)'
password = '(not shown)'