commit fe766ccbbc6c9807e78aef287a43b5b630e750e3
Author: Alexandre Dulaunoy
Date:   Sun Apr 14 14:05:30 2013 +0200

    Initial rss-tools crappy code from 2007 imported.

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..61320ea
--- /dev/null
+++ b/README.md
@@ -0,0 +1,52 @@
+RSS tools
+=========
+
+Following an old idea from 2007 published in a blog post called [RSS Everything?](http://www.foo.be/cgi-bin/wiki.pl/2007-02-11_RSS_Everything), this is a set of tools to
+work on RSS (Really Simple Syndication) feeds in a [Unix way](http://en.wikipedia.org/wiki/Unix_philosophy).
+
+The code committed in this repository is old Python code from 2007: it might break your PC, kill your cat, or the Flying Spaghetti Monster might lose a ball.
+
+Forks and pull requests are more than welcome. You have been warned: the code was just there to experiment with RSS workflows.
+
+Requirements
+------------
+
+* Python 2.x
+* Feedparser
+
+rsscluster.py
+-------------
+
+rsscluster.py is a simple script to cluster items from an RSS feed based on a time interval (expressed in number of days).
+The maxitem is the maximum number of items kept after the clustering. An example use is for del.icio.us/pinboard.in, where
+you can have a lot of bookmarks during one day and you want to cluster them into a single item per defined time slot, in RSS or in (X)HTML.
+
+    rsscluster.py --interval 2 --maxitem 20 "http://del.icio.us/rss/adulau" >adulau.xml
+
+rsscount.py
+-----------
+
+rsscount.py is a simple script to count how many items are in an RSS feed per day. This is used to build the [wiki creativity index](http://www.foo.be/cgi-bin/wiki.pl/WikiCreativityIndex). There is no limit on the number of url arguments.
+
+    rsscount.py "" | sort
+
+rssdir.py
+---------
+
+rssdir.py is a quick-and-dirty script to rssify any directory on the filesystem.
+
+    rssdir.py --prefix http://www.foo.be/cours/ . >rss.xml
+
+rssinternetdraft.py
+-------------------
+
+rssinternetdraft.py is a simple test to read an mbox file and generate an RSS feed from the message subjects.
+
+rssmerge.py
+-----------
+
+rssmerge.py is a simple script to gather RSS feeds and merge them in reverse time order. Useful to keep track of recent events.
+
+    python2.5 rssmerge.py --maxitem 30 --output phtml "http://api.flickr.com/services/feeds/photos_public.gne?id=31797858@N00&lang=en-us&format=atom" "http://www.foo.be/cgi-bin/wiki.pl?action=journal&tile=AdulauMessyDesk"
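All of the generators below build RSS 2.0 the same way: create an `rss` root with a `channel`, append one `item` per entry, and serialize with `ET.tostring`. A minimal sketch of that shared skeleton (the function name and arguments are illustrative, not part of the scripts):

    import time
    import xml.etree.ElementTree as ET

    def minimal_feed(title, link, items):
        # items: (epoch, item_title) tuples, newest first
        def rfc822(epoch):
            return time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime(epoch))
        root = ET.Element('rss', {'version': '2.0'})
        channel = ET.SubElement(root, 'channel')
        ET.SubElement(channel, 'title').text = title
        ET.SubElement(channel, 'link').text = link
        ET.SubElement(channel, 'pubDate').text = rfc822(time.time())
        for epoch, item_title in items:
            item = ET.SubElement(channel, 'item')
            ET.SubElement(item, 'title').text = item_title
            ET.SubElement(item, 'pubDate').text = rfc822(epoch)
        return ET.tostring(root)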
diff --git a/bin/rsscluster.py b/bin/rsscluster.py
new file mode 100644
index 0000000..43e4f77
--- /dev/null
+++ b/bin/rsscluster.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# a at foo dot be - Alexandre Dulaunoy - http://www.foo.be/cgi-bin/wiki.pl/RssAny
+#
+# rsscluster.py is a simple script to cluster items from an RSS feed based on a
+# time interval (expressed in number of days). The maxitem is the maximum
+# number of items kept after the clustering.
+#
+# An example use is for del.icio.us where you can have a lot of bookmarks during
+# one day and you want to cluster them in one single item in RSS or in (X)HTML.
+#
+# example of use :
+# python2.5 rsscluster.py --interval 5 --maxitem 20 "http://del.icio.us/rss/adulau" >adulau.xml
+
+import feedparser
+import sys, os
+import time
+import datetime
+import xml.etree.ElementTree as ET
+import hashlib
+from optparse import OptionParser
+
+#print sys.stdout.encoding
+version = "0.2"
+
+feedparser.USER_AGENT = "rsscluster.py " + version + " +http://www.foo.be/"
+
+
+def date_as_rfc(value):
+    return time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime(value))
+
+
+def build_rss(myitem, maxitem):
+
+    RSSroot = ET.Element('rss', {'version': '2.0'})
+    RSSchannel = ET.SubElement(RSSroot, 'channel')
+
+    ET.SubElement(RSSchannel, 'title').text = 'RSS cluster of ' + str(url) + ' per ' + options.interval + ' days'
+    ET.SubElement(RSSchannel, 'link').text = str(url)
+    ET.SubElement(RSSchannel, 'description').text = 'RSS cluster of ' + str(url) + ' per ' + options.interval + ' days'
+    ET.SubElement(RSSchannel, 'generator').text = 'by rsscluster.py ' + version
+    ET.SubElement(RSSchannel, 'pubDate').text = date_as_rfc(time.time())
+
+    for bloodyitem in myitem[0:maxitem]:
+
+        RSSitem = ET.SubElement(RSSchannel, 'item')
+        ET.SubElement(RSSitem, 'title').text = 'clustered data of ' + date_as_rfc(float(bloodyitem[0])) + " for " + str(url)
+        ET.SubElement(RSSitem, 'pubDate').text = date_as_rfc(float(bloodyitem[0]))
+        ET.SubElement(RSSitem, 'description').text = bloodyitem[1]
+        h = hashlib.md5()
+        h.update(bloodyitem[1].encode('utf-8'))
+        ET.SubElement(RSSitem, 'guid').text = h.hexdigest()
+
+    RSSfeed = ET.ElementTree(RSSroot)
+    feed = ET.tostring(RSSroot)
+    return feed
+
+
+def complete_feed(myfeed):
+
+    myheader = '<?xml version="1.0" encoding="UTF-8"?>'
+    return myheader + str(myfeed)
+
+
+def DaysInSec(val):
+
+    return int(val) * 24 * 60 * 60
+
+
+usage = "usage: %prog [options] url"
+parser = OptionParser(usage)
+
+parser.add_option("-m", "--maxitem", dest="maxitem", help="maximum item to list in the feed, default 200")
+parser.add_option("-i", "--interval", dest="interval", help="time interval expressed in days, default 1 day")
+
+# 2007-11-10 11:25:51
+pattern = '%Y-%m-%d %H:%M:%S'
+
+(options, args) = parser.parse_args()
+
+if options.interval is None:
+    options.interval = "1"
+
+if options.maxitem is None:
+    options.maxitem = 200
+
+if len(args) != 1:
+    parser.print_help()
+    parser.error("incorrect number of arguments")
+
+allitem = {}
+url = args[0]
+
+d = feedparser.parse(url)
+
+interval = DaysInSec(options.interval)
+
+previousepoch = []
+clusteredepoch = []
+tcluster = []
+
+for el in d.entries:
+
+    eldatetime = datetime.datetime.fromtimestamp(time.mktime(el.modified_parsed))
+    elepoch = int(time.mktime(time.strptime(unicode(eldatetime), pattern)))
+
+    if len(previousepoch):
+
+        #print el.link, int(previousepoch[0])-int(elepoch), interval
+
+        if len(clusteredepoch):
+            value = clusteredepoch.pop()
+        else:
+            value = ""
+
+        clusteredepoch.append(value + " " + el.title)
+
+        if not ((int(previousepoch[0]) - int(elepoch)) < interval):
+
+            value = clusteredepoch.pop()
+
+            # capture the cluster start before popping, so a one-entry
+            # cluster does not raise an IndexError
+            startepoch = previousepoch[0]
+            starttimetuple = datetime.datetime.fromtimestamp(startepoch)
+            endttimetuple = datetime.datetime.fromtimestamp(previousepoch.pop())
+            clusteredepoch.append(value + " from: " + unicode(starttimetuple.ctime()) + " to: " + unicode(endttimetuple.ctime()))
+            startdatelist = unicode(startepoch), unicode(clusteredepoch[len(clusteredepoch) - 1])
+            tcluster.append(startdatelist)
+            del previousepoch[0:len(previousepoch)]
+            del clusteredepoch[0:len(clusteredepoch)]
+    else:
+        clusteredepoch.append(" " + el.title)
+    previousepoch.append(elepoch)
+
+# if the last cluster list was not complete, we add the time period information
+if len(previousepoch):
+    value = clusteredepoch.pop()
+    startepoch = previousepoch[0]
+    starttimetuple = datetime.datetime.fromtimestamp(startepoch)
+    endttimetuple = datetime.datetime.fromtimestamp(previousepoch.pop())
+    clusteredepoch.append(value + " from: " + unicode(starttimetuple.ctime()) + " to: " + unicode(endttimetuple.ctime()))
+    tcluster.append((unicode(startepoch), unicode(clusteredepoch[len(clusteredepoch) - 1])))
+    del previousepoch[0:len(previousepoch)]
+
+tcluster.sort()
+tcluster.reverse()
+print complete_feed(build_rss(tcluster, int(options.maxitem)))
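The clustering rule is easy to lose in the feed plumbing above: an entry joins the current cluster while its distance to the first (newest) entry of that cluster stays below the interval, otherwise it closes the cluster and starts a new one. The same logic in isolation (function and variable names are mine, for illustration):

    def cluster_epochs(epochs, interval_seconds):
        # epochs: entry timestamps, newest first, as feedparser yields them
        clusters = []
        current = []
        for epoch in epochs:
            if current and (current[0] - epoch) >= interval_seconds:
                clusters.append(current)
                current = []
            current.append(epoch)
        if current:
            clusters.append(current)
        return clusters

    # cluster_epochs([1000, 990, 500, 490], 100) -> [[1000, 990], [500, 490]]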
diff --git a/bin/rsscount.py b/bin/rsscount.py
new file mode 100644
index 0000000..1f1b735
--- /dev/null
+++ b/bin/rsscount.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# a at foo dot be - Alexandre Dulaunoy - http://www.foo.be/cgi-bin/wiki.pl/RssAny
+#
+# rsscount.py is a simple script to count how many items are in an RSS feed per day.
+#
+# The output is the date (YYYYMMDD) and the number of items, separated by a tab.
+#
+# This is used to build statistics like the wiki creativity index.
+#
+
+import feedparser
+import sys, os
+import time
+import datetime
+from optparse import OptionParser
+
+
+feedparser.USER_AGENT = "rsscount.py +http://www.foo.be/"
+
+
+usage = "usage: %prog url(s)"
+parser = OptionParser(usage)
+
+
+(options, args) = parser.parse_args()
+
+if not args:
+    parser.print_help()
+    sys.exit(0)
+
+
+counteditem = {}
+
+for url in args:
+
+    d = feedparser.parse(url)
+
+    for el in d.entries:
+
+        try:
+            eldatetime = datetime.datetime.fromtimestamp(time.mktime(el.modified_parsed))
+        except AttributeError:
+            # discard RSS without pubDate grrr...
+            break
+
+        eventdate = eldatetime.isoformat(' ').split(' ', 1)
+        edate = eventdate[0].replace("-", "")
+
+        if counteditem.has_key(edate):
+            counteditem[edate] = counteditem[edate] + 1
+        else:
+            counteditem[edate] = 1
+
+
+for k in counteditem.keys():
+
+    print unicode(k).encode("utf-8") + "\t" + unicode(counteditem[k]).encode("utf-8")
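The counting loop boils down to bucketing entry dates by day. An equivalent, self-contained version (a hypothetical helper, using dict.get instead of has_key):

    import datetime

    def count_per_day(entry_datetimes):
        # entry_datetimes: datetime.datetime objects taken from feed entries
        counted = {}
        for dt in entry_datetimes:
            key = dt.strftime("%Y%m%d")  # e.g. '20071110', the format rsscount.py prints
            counted[key] = counted.get(key, 0) + 1
        return counted

    # count_per_day([datetime.datetime(2007, 11, 10, 11, 25),
    #                datetime.datetime(2007, 11, 10, 12, 0)]) -> {'20071110': 2}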
diff --git a/bin/rssdir.py b/bin/rssdir.py
new file mode 100644
index 0000000..f363c4d
--- /dev/null
+++ b/bin/rssdir.py
@@ -0,0 +1,117 @@
+# rssdir.py
+# a at foo dot be - Alexandre Dulaunoy - http://www.foo.be/cgi-bin/wiki.pl/RssAny
+#
+# rssdir is a quick-and-dirty script to rssify any directory on the filesystem.
+#
+# an example of use on the current directory :
+#
+# python2.5 /usr/local/bin/rssdir.py --prefix http://www.foo.be/cours/ . >rss.xml
+#
+# python2.5 is not really needed, except for ElementTree (in the standard
+# library since 2.5), but you are free to install it.
+
+import os, fnmatch
+import time
+import sys
+import xml.etree.ElementTree as ET
+from optparse import OptionParser
+
+version = "0.1"
+
+# recursive file listing function from the ASPN cookbook
+def all_files(root, patterns='*', single_level=False, yield_folders=False):
+    patterns = patterns.split(';')
+    for path, subdirs, files in os.walk(root):
+        if yield_folders:
+            files.extend(subdirs)
+        files.sort()
+        for name in files:
+            for pattern in patterns:
+                if fnmatch.fnmatch(name, pattern):
+                    yield os.path.join(path, name)
+                    break
+        if single_level:
+            break
+
+def date_files(filelist):
+    date_filename_list = []
+
+    for filename in filelist:
+        stats = os.stat(filename)
+        last_update = stats[8]  # st_mtime
+        date_filename_tuple = last_update, filename
+        date_filename_list.append(date_filename_tuple)
+
+    return date_filename_list
+
+def date_as_rfc(value):
+    return time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime(value))
+
+
+def build_rss(myitem, maxitem):
+
+    RSSroot = ET.Element('rss', {'version': '2.0'})
+    RSSchannel = ET.SubElement(RSSroot, 'channel')
+
+    ET.SubElement(RSSchannel, 'title').text = 'RSS feed of ' + str(title)
+    ET.SubElement(RSSchannel, 'link').text = link
+    ET.SubElement(RSSchannel, 'description').text = 'A directory RSSified by rssdir.py ' + version
+    ET.SubElement(RSSchannel, 'generator').text = 'A directory RSSified by rssdir.py ' + version
+    ET.SubElement(RSSchannel, 'pubDate').text = date_as_rfc(time.time())
+
+    for bloodyitem in myitem[0:maxitem]:
+
+        RSSitem = ET.SubElement(RSSchannel, 'item')
+        ET.SubElement(RSSitem, 'title').text = bloodyitem[1]
+        ET.SubElement(RSSitem, 'pubDate').text = date_as_rfc(bloodyitem[0])
+        ET.SubElement(RSSitem, 'description').text = prefixurl + bloodyitem[1]
+        ET.SubElement(RSSitem, 'guid').text = prefixurl + bloodyitem[1]
+
+    RSSfeed = ET.ElementTree(RSSroot)
+    feed = ET.tostring(RSSroot)
+    return feed
+
+
+def complete_feed(myfeed):
+
+    myheader = '<?xml version="1.0" encoding="UTF-8"?>'
+    return myheader + str(myfeed)
+
+
+usage = "usage: %prog [options] directory"
+parser = OptionParser(usage)
+
+parser.add_option("-p", "--prefix", dest="prefix", help="http prefix to be used for each entry, default none")
+parser.add_option("-t", "--title", dest="title", help="set a title to the rss feed, default using prefix", type="string")
+parser.add_option("-l", "--link", dest="link", help="http link set, default is prefix and none if prefix not set")
+parser.add_option("-m", "--maxitem", dest="maxitem", help="maximum item to list in the feed, default 32", type="int")
+
+(options, args) = parser.parse_args()
+
+if len(args) != 1:
+    parser.print_help()
+    parser.error("incorrect number of arguments")
+
+if options.prefix is None:
+    prefixurl = ''
+else:
+    prefixurl = options.prefix
+
+if options.link is None:
+    link = options.prefix
+else:
+    link = options.link
+
+if options.title is None:
+    title = options.prefix
+else:
+    title = options.title
+
+if options.maxitem is None:
+    maxitem = 32
+else:
+    maxitem = options.maxitem
+
+
+mylist = date_files(all_files(args[0]))
+
+mylist.sort()
+mylist.reverse()
+
+print complete_feed(build_rss(mylist, maxitem))
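rssdir.py's pipeline is: walk the tree, pair every file with its mtime, sort newest first, hand the list to build_rss. The same pairing and ordering, condensed (an illustrative rewrite, not the script's API):

    import os

    def newest_first(root):
        # (mtime, path) tuples, newest first -- the shape rssdir.py
        # hands to build_rss()
        entries = []
        for path, subdirs, files in os.walk(root):
            for name in files:
                full = os.path.join(path, name)
                entries.append((os.path.getmtime(full), full))
        entries.sort()
        entries.reverse()
        return entries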
diff --git a/bin/rssinternetdraft.py b/bin/rssinternetdraft.py
new file mode 100644
index 0000000..8105ba4
--- /dev/null
+++ b/bin/rssinternetdraft.py
@@ -0,0 +1,53 @@
+#
+# quick-and-dirty(tm) script to gather IETF Internet-Draft announces
+# from an mbox and to generate a nice RSS feed of the recent announces.
+#
+# for more information : http://www.foo.be/ietf/id/
+
+import mailbox
+import time
+import re
+import xml.etree.ElementTree as ET
+
+date_rfc2822 = "%a, %d %b %Y %H:%M:%S"
+
+tmsg = []
+
+def date_as_rfc(value):
+    return time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime(value))
+
+def build_rss(myitem, maxitem):
+
+    RSSroot = ET.Element('rss', {'version': '2.0'})
+    RSSchannel = ET.SubElement(RSSroot, 'channel')
+
+    ET.SubElement(RSSchannel, 'title').text = 'Latest Internet-Drafts (IDs) Published - IETF - custom RSS feed'
+    ET.SubElement(RSSchannel, 'link').text = 'http://www.foo.be/ietf/id/'
+    ET.SubElement(RSSchannel, 'description').text = 'Latest Internet-Drafts (IDs) Published - IETF - custom RSS feed'
+    ET.SubElement(RSSchannel, 'generator').text = 'rssany extended for parsing IETF IDs - http://www.foo.be/cgi-bin/wiki.pl/RssAny'
+#    ET.SubElement(RSSchannel, 'pubDate').text = date_as_rfc(time.time())
+    ET.SubElement(RSSchannel, 'pubDate').text = date_as_rfc(time.time() - 10000)
+
+    for bloodyitem in myitem[0:maxitem]:
+        RSSitem = ET.SubElement(RSSchannel, 'item')
+        ET.SubElement(RSSitem, 'title').text = bloodyitem[1]
+        ET.SubElement(RSSitem, 'pubDate').text = date_as_rfc(bloodyitem[0])
+        ET.SubElement(RSSitem, 'description').text = '<pre>' + bloodyitem[2] + '</pre>'
+        ET.SubElement(RSSitem, 'guid').text = "http://tools.ietf.org/html/" + bloodyitem[3]
+        ET.SubElement(RSSitem, 'link').text = "http://tools.ietf.org/html/" + bloodyitem[3]
+    RSSfeed = ET.ElementTree(RSSroot)
+    feed = ET.tostring(RSSroot)
+    return feed
+
+for message in mailbox.mbox('/var/spool/mail/ietf'):
+    subject = message['subject']
+    date = message['date']
+    date_epoch = int(time.mktime(time.strptime(date[0:-12], date_rfc2822)))
+    message_id = message['Message-Id']
+    body = message.get_payload()[0].get_payload()
+    draft_id = subject.split(":")[1].split(".")[0]
+    tmsg.append([date_epoch, subject, body, draft_id])
+
+tmsg.sort()
+tmsg.reverse()
+print build_rss(tmsg, 100)
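The draft name is derived from the announce subject with a dense one-liner. Unpacked, with a hypothetical subject line (the exact announce format is assumed from 2007-era I-D announcements):

    def draft_id_from_subject(subject):
        # "I-D ACTION:draft-ietf-foo-bar-02.txt" -> "draft-ietf-foo-bar-02"
        return subject.split(":")[1].split(".")[0].strip()

    # draft_id_from_subject("I-D ACTION:draft-ietf-foo-bar-02.txt")
    # -> 'draft-ietf-foo-bar-02'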
diff --git a/bin/rssmerge.py b/bin/rssmerge.py
new file mode 100644
index 0000000..684e234
--- /dev/null
+++ b/bin/rssmerge.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# a at foo dot be - Alexandre Dulaunoy - http://www.foo.be/cgi-bin/wiki.pl/RssAny
+#
+# rssmerge.py is a simple script to gather rss feeds and merge them in reverse
+# time order. Useful to keep track of recent events.
+#
+# this is still an early prototype and assumes that you have full control of the
+# remote rss feeds (if not, you may have some security issues).
+#
+# TODO : - rss 2.0 and atom output
+#        - full html output
+#
+# example of use :
+# python2.5 rssmerge.py --output phtml --maxitem 20 "http://www.foo.be/cgi-bin/wiki.pl?action=journal&tile=AdulauMessyDesk"
+# "http://api.flickr.com/services/feeds/photos_public.gne?id=31797858@N00&lang=en-us&format=atom"
+# "http://a.6f2.net/cgi-bin/gitweb.cgi?p=adulau/.git;a=rss"
+# "http://www.librarything.com/rss/reviews/adulau" > /tmp/test.inc
+
+import feedparser
+import sys, os
+import time
+import datetime
+import hashlib
+from optparse import OptionParser
+import cgi
+
+feedparser.USER_AGENT = "rssmerge.py +http://www.foo.be/"
+
+def RenderMerge(itemlist, output="text"):
+
+    i = 0
+
+    if output == "text":
+        for item in itemlist:
+            i = i + 1
+            # keep a consistent datetime representation (otherwise use allitem[item[1]]['updated'])
+            timetuple = datetime.datetime.fromtimestamp(allitem[item[1]]['epoch'])
+
+            print str(i) + ":" + allitem[item[1]]['title'].encode('utf-8') + ":" + timetuple.ctime() + ":" + allitem[item[1]]['link']
+
+            if i == int(options.maxitem):
+                break
+
+    if output == "phtml":
+        # phtml: emit the merged list as an (X)HTML fragment,
+        # escaped with cgi.escape
+        print "<ul>"
+        for item in itemlist:
+            i = i + 1
+            timetuple = datetime.datetime.fromtimestamp(allitem[item[1]]['epoch'])
+            print "<li><a href=\"" + cgi.escape(allitem[item[1]]['link'], True) + "\">" + cgi.escape(allitem[item[1]]['title']).encode('utf-8') + "</a> (" + timetuple.ctime() + ")</li>"
+            if i == int(options.maxitem):
+                break
+        print "</ul>"
+
+
+usage = "usage: %prog [options] url"
+parser = OptionParser(usage)
+
+parser.add_option("-m", "--maxitem", dest="maxitem", help="maximum item to list in the feed, default 200")
+parser.add_option("-o", "--output", dest="output", help="output format (text, phtml), default text")
+
+# 2007-11-10 11:25:51
+pattern = '%Y-%m-%d %H:%M:%S'
+
+(options, args) = parser.parse_args()
+
+if options.output is None:
+    options.output = "text"
+
+if options.maxitem is None:
+    options.maxitem = 200
+
+allitem = {}
+
+for url in args:
+
+    #print url
+
+    d = feedparser.parse(url)
+
+    for el in d.entries:
+
+        eldatetime = datetime.datetime.fromtimestamp(time.mktime(el.modified_parsed))
+        elepoch = int(time.mktime(time.strptime(str(eldatetime), pattern)))
+        linkkey = hashlib.md5(el.link.encode('utf-8')).hexdigest()
+        allitem[linkkey] = {}
+        allitem[linkkey]['link'] = str(el.link)
+        allitem[linkkey]['epoch'] = int(elepoch)
+        allitem[linkkey]['updated'] = el.updated
+        allitem[linkkey]['title'] = el.title
+
+
+itemlist = []
+
+for something in allitem.keys():
+    epochkeytuple = (allitem[something]['epoch'], something)
+    itemlist.append(epochkeytuple)
+
+itemlist.sort()
+itemlist.reverse()
+
+RenderMerge(itemlist, options.output)
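rssmerge.py's merge strategy, reduced to its essence: key every entry by the MD5 of its link, so the same item seen in several feeds is kept only once, then sort by epoch, newest first. A standalone sketch of that idea (names and the tuple shape are illustrative):

    import hashlib

    def merge_entries(feeds):
        # feeds: iterable of iterables of (epoch, link, title) tuples;
        # on duplicate links the last feed wins, as in rssmerge.py
        merged = {}
        for feed in feeds:
            for epoch, link, title in feed:
                key = hashlib.md5(link.encode('utf-8')).hexdigest()
                merged[key] = (epoch, link, title)
        entries = merged.values()
        entries.sort()
        entries.reverse()  # newest first
        return entries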