From 54a82c0ce92082ccbc8e05526da399ac081fd656 Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Sun, 11 Feb 2024 10:49:20 +0100 Subject: [PATCH] chg: [rsscluster] Python2 to Python3 + updates - Trying to resurrect old script to merge rss - Testing with Mastodon rss feed (which doesn't contain title) -> https://github.com/mastodon/mastodon/issues/18553 - Might need to remove the title completely (it seems an RSS feed can work without `title` field) - Fixing some random bugs in this 11-year-old code --- bin/rsscluster.py | 155 ++++++++++++++++++++++++++++++---------------- 1 file changed, 100 insertions(+), 55 deletions(-) diff --git a/bin/rsscluster.py b/bin/rsscluster.py index 43e4f77..b6e2d8a 100644 --- a/bin/rsscluster.py +++ b/bin/rsscluster.py @@ -7,81 +7,103 @@ # time interval (expressed in number of days). The maxitem is the # number of item maximum after the clustering. # -# an example use is for del.icio.us where you can have a lot of bookmarks during +# an example use is for Mastodon where you can have a lot of toots during # one day and you want to cluster them in one single item in RSS or in (X)HTML. 
-# -# example of use : -# python2.5 rsscluster.py --interval 5 --maxitem 20 "http://del.icio.us/rss/adulau" >adulau.xml +# +# example of use : +# python3 rsscluster.py --interval 5 --maxitem 20 "https://paperbay.org/@a.rss" >adulau.xml import feedparser -import sys,os +import sys, os import time import datetime import xml.etree.ElementTree as ET import hashlib from optparse import OptionParser -#print sys.stdout.encoding +# print sys.stdout.encoding version = "0.2" -feedparser.USER_AGENT = "rsscluster.py "+ version + " +http://www.foo.be/" +feedparser.USER_AGENT = "rsscluster.py " + version + " +http://www.foo.be/" def date_as_rfc(value): - return time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime(value)) + return time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime(value)) -def build_rss(myitem,maxitem): +def build_rss(myitem, maxitem): - RSSroot = ET.Element( 'rss', {'version':'2.0'} ) - RSSchannel = ET.SubElement( RSSroot, 'channel' ) + RSSroot = ET.Element("rss", {"version": "2.0"}) + RSSchannel = ET.SubElement(RSSroot, "channel") - ET.SubElement( RSSchannel, 'title' ).text = 'RSS cluster of ' + str(url) +' per '+options.interval+' days' - ET.SubElement( RSSchannel, 'link' ).text = str(url) - ET.SubElement( RSSchannel, 'description' ).text = 'RSS cluster of ' + str(url) +' per '+options.interval+' days' - ET.SubElement( RSSchannel, 'generator' ).text = 'by rsscluster.py ' + version - ET.SubElement( RSSchannel, 'pubDate' ).text = date_as_rfc(time.time()) + ET.SubElement(RSSchannel, "title").text = ( + "RSS cluster of " + str(url) + " per " + str(options.interval) + " days" + ) + ET.SubElement(RSSchannel, "link").text = str(url) + ET.SubElement(RSSchannel, "description").text = ( + "RSS cluster of " + str(url) + " per " + str(options.interval) + " days" + ) + ET.SubElement(RSSchannel, "generator").text = "by rsscluster.py " + version + ET.SubElement(RSSchannel, "pubDate").text = date_as_rfc(time.time()) - for bloodyitem in myitem[0:maxitem]: + for 
bloodyitem in myitem[0:maxitem]: - RSSitem = ET.SubElement ( RSSchannel, 'item' ) - ET.SubElement( RSSitem, 'title' ).text = 'clustered data of ' + date_as_rfc(float(bloodyitem[0])) +" for "+ str(url) - ET.SubElement( RSSitem, 'pubDate' ).text = date_as_rfc(float(bloodyitem[0])) - ET.SubElement( RSSitem, 'description').text = bloodyitem[1] + RSSitem = ET.SubElement(RSSchannel, "item") + ET.SubElement(RSSitem, "title").text = ( + "clustered data of " + + date_as_rfc(float(bloodyitem[0])) + + " for " + + str(url) + ) + ET.SubElement(RSSitem, "pubDate").text = date_as_rfc(float(bloodyitem[0])) + ET.SubElement(RSSitem, "description").text = bloodyitem[1] h = hashlib.md5() - h.update(bloodyitem[1]) - ET.SubElement( RSSitem, 'guid').text = h.hexdigest() + h.update(bloodyitem[1].encode("utf-8")) + ET.SubElement(RSSitem, "guid").text = h.hexdigest() - RSSfeed = ET.ElementTree(RSSroot) - feed = ET.tostring(RSSroot) - return feed + RSSfeed = ET.ElementTree(RSSroot) + feed = ET.tostring(RSSroot) + return feed def complete_feed(myfeed): - myheader = '' - return myheader + str(myfeed) + myheader = '' + return myheader + str(myfeed) + def DaysInSec(val): - return int(val)*24*60*60 + return int(val) * 24 * 60 * 60 + usage = "usage: %prog [options] url" parser = OptionParser(usage) -parser.add_option("-m","--maxitem",dest="maxitem",help="maximum item to list in the feed, default 200") -parser.add_option("-i","--interval",dest="interval",help="time interval expressed in days, default 1 day") +parser.add_option( + "-m", + "--maxitem", + dest="maxitem", + help="maximum item to list in the feed, default 200", +) +parser.add_option( + "-i", + "--interval", + dest="interval", + help="time interval expressed in days, default 1 day", +) -#2007-11-10 11:25:51 -pattern = '%Y-%m-%d %H:%M:%S' +# 2007-11-10 11:25:51 +pattern = "%Y-%m-%d %H:%M:%S" (options, args) = parser.parse_args() -if options.interval == None: - options.output = 1 +if options.interval is None: + options.interval = 1 + 
options.output = 1 if options.maxitem == None: - options.maxitem = 200 + options.maxitem = 200 if len(args) != 1: @@ -93,6 +115,9 @@ url = args[0] d = feedparser.parse(url) +if options.interval is None: + options.interval = 0 + interval = DaysInSec(options.interval) previousepoch = [] @@ -100,35 +125,52 @@ clusteredepoch = [] tcluster = [] for el in d.entries: + if 'modified_parsed' in el: + eldatetime = datetime.datetime.fromtimestamp(time.mktime(el.modified_parsed)) + else: + eldatetime = datetime.datetime.fromtimestamp(time.mktime(el.published_parsed)) - eldatetime = datetime.datetime.fromtimestamp(time.mktime(el.modified_parsed)) - elepoch = int(time.mktime(time.strptime(unicode(eldatetime), pattern))) + elepoch = int(time.mktime(time.strptime(str(eldatetime), pattern))) if len(previousepoch): - #print el.link, int(previousepoch[0])-int(elepoch), interval + # print el.link, int(previousepoch[0])-int(elepoch), interval if len(clusteredepoch): value = clusteredepoch.pop() else: value = "" + if 'title' in el: + clusteredepoch.append(value + ' ' + el.title + "") + else: + clusteredepoch.append(value + ' ' + el.summary + "") - clusteredepoch.append(value+" "+el.title+"") - - - if not ((int(previousepoch[0])-int(elepoch)) < interval): + if not ((int(previousepoch[0]) - int(elepoch)) < interval): value = clusteredepoch.pop() starttimetuple = datetime.datetime.fromtimestamp(previousepoch[0]) endttimetuple = datetime.datetime.fromtimestamp(previousepoch.pop()) - clusteredepoch.append(value+ " from: "+unicode(starttimetuple.ctime())+" to: "+unicode(endttimetuple.ctime())) - startdatelist = unicode(previousepoch[0]),unicode(clusteredepoch[len(clusteredepoch)-1]) - tcluster.append(startdatelist) - del previousepoch[0:len(previousepoch)] - del clusteredepoch[0:len(clusteredepoch)] + clusteredepoch.append( + value + + " from: " + + str(starttimetuple.ctime()) + + " to: " + + str(endttimetuple.ctime()) + ) + if previousepoch: + startdatelist = str(previousepoch[0]), str( + 
clusteredepoch[len(clusteredepoch) - 1] + ) + tcluster.append(startdatelist) + del previousepoch[0 : len(previousepoch)] + del clusteredepoch[0 : len(clusteredepoch)] else: - clusteredepoch.append(" "+el.title+"") + if 'title' in el: + clusteredepoch.append(' ' + el.title + "") + else: + clusteredepoch.append(' ' + el.summary + "") + previousepoch.append(elepoch) # if last cluster list was not complete, we add the time period information. @@ -136,13 +178,16 @@ if len(previousepoch): value = clusteredepoch.pop() starttimetuple = datetime.datetime.fromtimestamp(previousepoch[0]) endttimetuple = datetime.datetime.fromtimestamp(previousepoch.pop()) - clusteredepoch.append(value+ " from: "+unicode(starttimetuple.ctime())+" to: "+unicode(endttimetuple.ctime())) - del previousepoch[0:len(previousepoch)] + clusteredepoch.append( + value + + " from: " + + str(starttimetuple.ctime()) + + " to: " + + str(endttimetuple.ctime()) + ) + del previousepoch[0 : len(previousepoch)] tcluster.sort() tcluster.reverse() -print complete_feed(build_rss(tcluster,int(options.maxitem))) - - - +print(complete_feed(build_rss(tcluster, int(options.maxitem))))