Jump to content

User:Alterego/signpost.py

From Wikipedia, the free encyclopedia

This is the Python script I wrote for creating an RSS feed of the Wikipedia Signpost. Note: this is an out-of-date version that I salvaged, and it still has some debugging code in it. I'm still trying to rescue the data from my crashed server (I just don't have much time).

Required Modules

[edit]

It requires the following third-party modules: ScrapeNFeed, BeautifulSoup and PyRSS2Gen.


#    Copyright (c) 2005, Brian Mingus
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.

#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.

#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

import cgi
import urllib
import re
import time
import ScrapeNFeed
from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
from PyRSS2Gen import RSSItem, Guid
from ftplib import FTP

# Change user agent to something not banned
class AppURLopener(urllib.FancyURLopener):
    """FancyURLopener that sends a custom User-Agent string.

    Accepts the same positional and keyword arguments as
    urllib.FancyURLopener and forwards them unchanged.
    """

    def __init__(self, *args, **kwargs):
        # Replace this with what you want to use.
        # It is assigned before the base constructor runs, as in the
        # original code (NOTE(review): the base constructor is understood
        # to read self.version when it builds its default headers).
        self.version = "User:Alterego for Wikipedia Signpost RSS feed"
        urllib.FancyURLopener.__init__(self, *args, **kwargs)

# Install the custom opener so that urllib.urlopen() sends our User-Agent
urllib._urlopener = AppURLopener()

# Download the signpost and parse the xml structure with BeautifulStoneSoup
# Note that this is from Special:Export
url = 'http://en.wikipedia.org/wiki/Special:Export/Wikipedia:Wikipedia_Signpost'
wikitext = urllib.urlopen(url)
try:
    export_soup = BeautifulStoneSoup(wikitext)
finally:
    # The response is only needed while it is being parsed; close it so the
    # connection is not left open for the rest of the run.
    # NOTE(review): relies on BeautifulStoneSoup reading the whole stream in
    # its constructor.
    wikitext.close()

# Find all the links to articles in the <text> element of the export
# Regexp: four digits, dash, two digits, dash, two digits, a slash, then
# any characters up to (not including) the next pipe
signpost = re.findall(r"\d{4}\-\d{2}\-\d{2}/[^\|]+",  export_soup.text.string)


class signpostFeed(ScrapeNFeed.ScrapedFeed):
    """Scraped feed that turns one Signpost article page into one RSS item."""

    def HTML2RSS(self, headers, body):
        """Extract the story from an article page and add it as an RSS item.

        The story is everything from the first <p> after the page's first
        <h2> up to (not including) the first <div> after that <h2>.  The
        item's title is the <h2> text and its link is self.url.

        headers -- unused.
        body    -- HTML of the article page.

        If the page has no <h2> there is nothing to anchor the story on and
        no item is added.
        """
        soup = BeautifulSoup(body)

        heading = soup.h2
        if heading is None:
            return

        # Gather the html body of the story.
        node = heading.findNext('p')
        stop = heading.findNext('div')

        story = ''
        # Identity comparison: stop at that exact <div>, not at any element
        # that merely compares equal to it.  Also stop cleanly when the walk
        # runs off the end of the document.
        while node is not None and node is not stop:
            markup = str(node)

            # .next also visits the children of an element that was already
            # appended as a whole; only add markup that isn't in the story
            # yet.  (Text that exactly repeats earlier text is skipped too.)
            if markup not in story:
                story += markup

            # Move to next element
            node = node.next

        # Let BeautifulSoup make the code pretty
        story = BeautifulSoup(story).prettify()

        # And convert all relative links to absolute
        # (strings are immutable, so the result has to be assigned back)
        story = story.replace('"/wiki/', '"http://en.wikipedia.org/wiki/')

        # And escape all html entities so they render in RSS
        # Note that for html display, I use Magpie, which
        # will unescape all these later
        # Note that non-intelligent rss readers won't render the html
        # but Bloglines does and that's what I use
        story = cgi.escape(story)

        # Debugging output
        print(story)

        # NOTE(review): the original had a disabled
        # "if not self.hasSeen(...)" guard here; every call adds the item
        # unconditionally.
        items = [RSSItem(title=heading.string,
                         description=story,
                         link=self.url)]
        self.addRSSItems(items)

# Load the feed once for every article link found on the Signpost page
for i, article in enumerate(signpost):
    # Debugging output
    print('i is :')
    print(i)
    print('signpost is :')
    print(article)
    # The description is built from adjacent string literals; a backslash
    # continuation inside one literal would embed the next line's indentation
    # in the text.
    signpostFeed.load("Wikipedia Signpost",
                      'http://en.wikipedia.org/wiki/Wikipedia:Wikipedia_Signpost/' + article.replace(' ', '_'),
                      'The Wikipedia Signpost is a community-written and community-edited '
                      'newspaper, covering events and stories related to the English Wikipedia.',
                      'F:\\signpost.rss',
                      'F:\\signpost.pickle')
  
# FTP server information for uploading the feed
# (host, user and password are blank placeholders and must be filled in)

ftp = FTP('','','')
ftp.set_debuglevel(2)

# storlines() needs a complete "STOR <remote name>" command and an open file
# object rather than a path.  The remote name is assumed to match the local
# file name -- adjust it if the server expects something else.
rss_file = open('F:\\signpost.rss', 'rb')
try:
    ftp.storlines('STOR signpost.rss', rss_file)
finally:
    rss_file.close()

# End the FTP session once the upload is done
ftp.quit()