User:Alterego/signpost.py
This is the Python script I wrote for creating an RSS feed of the Wikipedia Signpost. Note: this is an out-of-date version that I salvaged, and it still has some debugging code in it. I'm still trying to recover data from my crashed server (I just don't have much time).
Required Modules
It requires the following modules (a short sketch of how they fit together follows the list):
- Scrape 'N' Feed: http://www.crummy.com/software/ScrapeNFeed/
- BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
- PyRSS2Gen: http://www.dalkescientific.com/Python/PyRSS2Gen.html
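These three libraries fit together in a fairly fixed pattern: you subclass ScrapeNFeed.ScrapedFeed, implement HTML2RSS() to parse the fetched page with BeautifulSoup and build PyRSS2Gen RSSItem objects, and then call the class's load() method with a feed title, source URL, description, output file, and pickle file. Here is a minimal sketch of that pattern; the feed name, URL, and file paths are placeholders, not the values the real script uses.

import ScrapeNFeed
from BeautifulSoup import BeautifulSoup
from PyRSS2Gen import RSSItem

class ExampleFeed(ScrapeNFeed.ScrapedFeed):
    def HTML2RSS(self, headers, body):
        # Parse the downloaded page and turn its first heading into one item
        soup = BeautifulSoup(body)
        item = RSSItem(title=soup.h2.string,
                       description=str(soup.h2),
                       link=self.url)
        self.addRSSItems([item])

# load() fetches the URL, hands the response to HTML2RSS, and writes the
# feed to the .rss file; the .pickle file is where ScrapeNFeed keeps state
# (e.g. which items it has already seen) between runs.
ExampleFeed.load('Example feed',
                 'http://example.org/page.html',
                 'A placeholder description.',
                 'example.rss',
                 'example.pickle')

The full Signpost-specific script follows.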
# Copyright (c) 2005, Brian Mingus
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
import cgi
import urllib
import re
import time
import ScrapeNFeed
from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
from PyRSS2Gen import RSSItem, Guid
from ftplib import FTP
# Change user agent to something not banned
class AppURLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        # Replace this with what you want to use
        self.version = "User:Alterego for Wikipedia Signpost RSS feed"
        urllib.FancyURLopener.__init__(self, *args)

urllib._urlopener = AppURLopener()
# Download the signpost and parse the xml structure with BeautifulStoneSoup
# Note that this is from Special:Export
url = 'http://en.wikipedia.org/wiki/Special:Export/Wikipedia:Wikipedia_Signpost'
wikitext = urllib.urlopen(url)
signpost = BeautifulStoneSoup(wikitext)
# Find all the links to articles
# Regexp: four digits, dash, two digits, dash, two digits, a slash, then any characters up to a pipe
signpost = re.findall(r"\d{4}\-\d{2}\-\d{2}/[^\|]+", signpost.text.string)
class signpostFeed(ScrapeNFeed.ScrapedFeed):
    def HTML2RSS(self, headers, body):
        soup = BeautifulSoup(body)
        # Gather the html body of the story.
        textWeWant = ''
        start = soup.h2.findNext('p')
        end = soup.h2.findNext('div')
        while start is not None and start != end:
            # If it hasn't already been added to body
            if str(start) not in textWeWant:
                textWeWant += str(start)
            # Move to next element
            start = start.next
        # Let BeautifulSoup make the code pretty
        textWeWant = BeautifulSoup(textWeWant).prettify()
        # And convert all relative links to absolute
        textWeWant = textWeWant.replace('"/wiki/', '"http://en.wikipedia.org/wiki/')
        # And escape all html entities so they render in RSS
        # Note that for html display, I use Magpie, which
        # will unescape all these later
        # Note that non-intelligent rss readers won't render the html
        # but Bloglines does and that's what I use
        textWeWant = cgi.escape(textWeWant)
        print textWeWant
        items = []
        #if not self.hasSeen(textWeWant):
        items.append(RSSItem(title=soup.h2.string,
                             description=textWeWant,
                             link=self.url))
        self.addRSSItems(items)
for i in range(len(signpost)):
    print 'i is :'
    print i
    print 'signpost is :'
    print signpost[i]
    signpostFeed.load("Wikipedia Signpost",
                      'http://en.wikipedia.org/wiki/Wikipedia:Wikipedia_Signpost/' + signpost[i].replace(' ', '_'),
                      'The Wikipedia Signpost is a community-written and community-edited '
                      'newspaper, covering events and stories related to the English Wikipedia.',
                      'F:\\signpost.rss',
                      'F:\\signpost.pickle')
# FTP server information for uploading the feed
ftp = FTP('', '', '')
ftp.set_debuglevel(2)
# Upload the generated feed file
ftp.storlines('STOR signpost.rss', open('F:\\signpost.rss'))
ftp.quit()