Thursday, November 29, 2007

RSS reader update

If the 'onlyupdates' flag is set, the script runs in the background and prints only the updates.



import sgmllib
import feedparser
from threading import Thread
import time
import re
import urllib

class MyParse(sgmllib.SGMLParser):
    """SGML parser that records the attributes of every ``<outline>`` tag.

    Each ``<outline>`` start tag seen while parsing contributes one dict
    (attribute name -> value) to ``self.datalist``, in document order.
    """

    def __init__(self, verbose=0):
        """Create the parser.

        verbose -- forwarded to ``sgmllib.SGMLParser``.  (It used to be
                   accepted here but silently replaced by 0.)
        """
        sgmllib.SGMLParser.__init__(self, verbose)
        self.datalist = []

    def get_vals(self):
        """Return the list of attribute dicts collected so far.

        The list itself is returned, not a copy, so changes made by the
        caller are visible through ``self.datalist`` as well.
        """
        return self.datalist

    def parse(self, data):
        """Feed the complete document *data* to the parser and finish it."""
        self.feed(data)
        self.close()

    def start_outline(self, attributes):
        """Handle an ``<outline>`` start tag.

        attributes -- sequence of ``(name, value)`` pairs; if a name is
                      repeated, the last value wins.
        """
        self.datalist.append(dict(attributes))


class FetchFeed(Thread):
    """Thread that downloads and parses a single feed with ``feedparser``.

    After the thread has finished, ``feedresult`` holds the object returned
    by ``feedparser.parse`` -- or is still ``None`` when the parsed feed had
    no title.
    """

    def __init__(self, xmlurl):
        """Remember the feed location; nothing is fetched until ``start()``.

        xmlurl -- value handed unchanged to ``feedparser.parse``.
        """
        Thread.__init__(self)
        self.xmlurl = xmlurl
        self.feedresult = None

    def run(self):
        """Parse ``self.xmlurl`` and keep the result if the feed has a title."""
        parsed = feedparser.parse(self.xmlurl)
        # Presumably a missing title means the download failed or the
        # document was not a feed -- such results are discarded.
        # ``in`` replaces the deprecated ``has_key`` call.
        if 'title' in parsed.feed:
            self.feedresult = parsed
def strip_html(text):
    """Return *text* with markup tags removed and character references decoded.

    Anything that looks like a tag (``<...>``) is dropped.  Numeric character
    references (``&#65;``, ``&#x41;``, ``&#X41;``) and named entities
    (``&amp;``) are replaced by the character they denote.  References that
    cannot be resolved -- unknown names, malformed numbers, or code points
    outside the range the interpreter supports -- are left in place unchanged.
    """
    # Resolved locally so the function works on both Python 2 and Python 3
    # without depending on anything beyond the module-level ``re`` import.
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    def fixup(match):
        token = match.group(0)
        if token[:1] == "<":
            # A tag: remove it entirely.
            return ""
        if token[:2] == "&#":
            digits = token[2:-1]
            try:
                if digits[:1] in ("x", "X"):
                    return to_char(int(digits[1:], 16))
                return to_char(int(digits))
            except (ValueError, OverflowError):
                # Not a number, or not a valid code point (very large values
                # raise OverflowError rather than ValueError): keep the text.
                return token
        codepoint = name2codepoint.get(token[1:-1])
        if codepoint is None:
            # Unknown entity name: keep the text.
            return token
        return to_char(codepoint)

    return re.sub(r"(?s)<[^>]*>|&#?\w+;", fixup, text)


class ParseOpml(MyParse):
    """Fetch feeds (from an OPML file or one default URL) and print entries.

    Attributes:
      blogsdata    -- feed sources: attribute dicts of the OPML ``<outline>``
                      tags, or a list of URL strings when no OPML file is used.
      opmldocflag  -- 1 when the sources came from an OPML file, else 0.
      resultlimit  -- maximum entries printed per feed; ``None`` means all.
      feedresults  -- ``[feed, entries]`` pairs from the latest ``getfeeds``.
      threadlist   -- the ``FetchFeed`` threads of the latest ``getfeeds``.
      lastentry    -- newest dated entry printed so far in "only updates"
                      mode, or ``None`` when nothing has been printed yet.
    """

    def __init__(self, opmldoc=None):
        """Load the feed sources.

        opmldoc -- path of an OPML file; when omitted, a single built-in
                   feed URL is used instead.
        """
        MyParse.__init__(self, verbose=0)
        if opmldoc:
            fileobj = open(opmldoc)
            try:
                filedata = fileobj.read()
            finally:
                # Close the file even when reading fails.
                fileobj.close()
            MyParse.parse(self, filedata)
            self.blogsdata = MyParse.get_vals(self)
            # The first <outline> is discarded -- presumably it is a grouping
            # element rather than a feed (TODO confirm against the OPML file).
            if self.blogsdata:
                del self.blogsdata[0]
            self.opmldocflag = 1
            self.resultlimit = 3
        else:
            self.blogsdata = ['http://mvblogs.org/mvblogs.xml']
            self.opmldocflag = 0
            self.resultlimit = None

        # Initialised on both paths so every method can rely on them.
        self.feedresults = []
        self.threadlist = []
        self.lastentry = None

    def getfeeds(self, min=0, max=1):
        """Fetch the sources ``blogsdata[min:max]`` in parallel.

        The results replace ``self.feedresults``.  Previously threads and
        results piled up across calls, so every poll re-recorded all earlier
        downloads.
        """
        if self.blogsdata:
            print(self.blogsdata[0])

        # Start from a clean slate: each call describes one fetch.
        self.feedresults = []
        self.threadlist = []

        for source in self.blogsdata[min:max]:
            if self.opmldocflag:
                # Outlines without a feed URL cannot be fetched; skip them.
                url = source.get('xmlurl')
            else:
                url = source
            if not url:
                continue
            worker = FetchFeed(url)
            worker.start()
            self.threadlist.append(worker)

        for worker in self.threadlist:
            worker.join()
            self.getfeed(worker)

    def getfeed(self, tl):
        """Record the result of the finished fetch thread *tl*, if it has one.

        A ``[feed, entries]`` pair is appended to ``self.feedresults``.
        ``self.lastentry`` is no longer touched here; it is maintained by
        ``printfeeds``.
        """
        if not tl.feedresult:
            return
        self.feedresults.append([tl.feedresult.feed, tl.feedresult.entries])

    def printfeeds(self, onlyupdates='false'):
        """Print the entries gathered by the latest ``getfeeds`` call.

        onlyupdates -- a boolean, or a string such as 'true' / 'false'.  When
                       set, only entries newer than the newest entry printed
                       by an earlier "only updates" call are shown.  (The
                       default string 'false' used to be treated as true.)

        Note: a single ``lastentry`` is shared by all feeds, so with several
        feeds an update older than the newest entry of another feed is not
        reported.
        """
        updates_only = self._as_flag(onlyupdates)
        baseline = self.lastentry
        newest = baseline

        for feedinfo, entries in self.feedresults:
            # Slicing copes with feeds shorter than the limit and leaves
            # self.resultlimit untouched for the next feed / next call.
            if self.resultlimit:
                entries = entries[:self.resultlimit]
            for entry in entries:
                if updates_only and not self._is_newer(entry, baseline):
                    continue

                if self.opmldocflag:
                    print(feedinfo.get('author', ''))
                    print(feedinfo.get('title', ''))

                print(entry.get('date', ''))
                print(entry.get('title', ''))
                print(entry.get('link', ''))
                print(strip_html(entry.get('description', '')))
                print("#" * 100)

                # Only dated entries can serve as the next baseline.
                if (self._entry_time(entry) is not None
                        and self._is_newer(entry, newest)):
                    newest = entry

        if updates_only:
            self.lastentry = newest

    def _as_flag(self, value):
        """Interpret a boolean or a string like 'true' / 'false' as a bool."""
        if hasattr(value, 'lower'):
            return value.strip().lower() in ('true', '1', 'yes', 'on')
        return bool(value)

    def _entry_time(self, entry):
        """Return the entry's parsed date as a tuple, or None if unavailable.

        Reads the ``date_parsed`` key, assumed to be the time tuple that
        feedparser derives from the entry date -- confirm for the feedparser
        version in use.
        """
        parsed = entry.get('date_parsed')
        if parsed is None:
            return None
        return tuple(parsed)

    def _is_newer(self, entry, reference):
        """Tell whether *entry* is dated strictly after *reference*.

        With no reference every entry counts as newer; an entry that cannot
        be ordered (missing parsed date on either side) does not.
        """
        if reference is None:
            return True
        entry_time = self._entry_time(entry)
        reference_time = self._entry_time(reference)
        if entry_time is None or reference_time is None:
            return False
        return entry_time > reference_time



# To read the feed list from an OPML export instead of the default feed:
#     parseobj = ParseOpml('mvblogs-export.xml')
#     parseobj.getfeeds(0, 3)
parseobj = ParseOpml()
while True:
    # Fetch, print, then pause 30 seconds before polling again.
    parseobj.getfeeds()
    parseobj.printfeeds('true')
    time.sleep(30)


No comments: