chg: [rsscluster] Python2 to Python3 + updates

- Trying to resurrect an old script to merge RSS feeds
- Testing with a Mastodon RSS feed (which doesn't contain a title) -> https://github.com/mastodon/mastodon/issues/18553
  - Might need to remove the title completely (it seems an RSS feed
  can work without a `title` field)
- Fixing some random bugs in this 11-year-old code
This commit is contained in:
Alexandre Dulaunoy 2024-02-11 10:49:20 +01:00
parent fe766ccbbc
commit 54a82c0ce9
Signed by: adulau
GPG key ID: 09E2CD4944E6CBCD

View file

@ -7,11 +7,11 @@
# time interval (expressed in number of days). The maxitem is the # time interval (expressed in number of days). The maxitem is the
# number of item maximum after the clustering. # number of item maximum after the clustering.
# #
# an example use is for del.icio.us where you can have a lot of bookmarks during # an example use is for Mastodon where you can have a lot of toots during
# one day and you want to cluster them in one single item in RSS or in (X)HTML. # one day and you want to cluster them in one single item in RSS or in (X)HTML.
# #
# example of use : # example of use :
# python2.5 rsscluster.py --interval 5 --maxitem 20 "http://del.icio.us/rss/adulau" >adulau.xml # python3 rsscluster.py --interval 5 --maxitem 20 "https://paperbay.org/@a.rss" >adulau.xml
import feedparser import feedparser
import sys, os import sys, os
@ -33,24 +33,33 @@ def date_as_rfc(value):
def build_rss(myitem, maxitem): def build_rss(myitem, maxitem):
RSSroot = ET.Element( 'rss', {'version':'2.0'} ) RSSroot = ET.Element("rss", {"version": "2.0"})
RSSchannel = ET.SubElement( RSSroot, 'channel' ) RSSchannel = ET.SubElement(RSSroot, "channel")
ET.SubElement( RSSchannel, 'title' ).text = 'RSS cluster of ' + str(url) +' per '+options.interval+' days' ET.SubElement(RSSchannel, "title").text = (
ET.SubElement( RSSchannel, 'link' ).text = str(url) "RSS cluster of " + str(url) + " per " + str(options.interval) + " days"
ET.SubElement( RSSchannel, 'description' ).text = 'RSS cluster of ' + str(url) +' per '+options.interval+' days' )
ET.SubElement( RSSchannel, 'generator' ).text = 'by rsscluster.py ' + version ET.SubElement(RSSchannel, "link").text = str(url)
ET.SubElement( RSSchannel, 'pubDate' ).text = date_as_rfc(time.time()) ET.SubElement(RSSchannel, "description").text = (
"RSS cluster of " + str(url) + " per " + str(options.interval) + " days"
)
ET.SubElement(RSSchannel, "generator").text = "by rsscluster.py " + version
ET.SubElement(RSSchannel, "pubDate").text = date_as_rfc(time.time())
for bloodyitem in myitem[0:maxitem]: for bloodyitem in myitem[0:maxitem]:
RSSitem = ET.SubElement ( RSSchannel, 'item' ) RSSitem = ET.SubElement(RSSchannel, "item")
ET.SubElement( RSSitem, 'title' ).text = 'clustered data of ' + date_as_rfc(float(bloodyitem[0])) +" for "+ str(url) ET.SubElement(RSSitem, "title").text = (
ET.SubElement( RSSitem, 'pubDate' ).text = date_as_rfc(float(bloodyitem[0])) "clustered data of "
ET.SubElement( RSSitem, 'description').text = bloodyitem[1] + date_as_rfc(float(bloodyitem[0]))
+ " for "
+ str(url)
)
ET.SubElement(RSSitem, "pubDate").text = date_as_rfc(float(bloodyitem[0]))
ET.SubElement(RSSitem, "description").text = bloodyitem[1]
h = hashlib.md5() h = hashlib.md5()
h.update(bloodyitem[1]) h.update(bloodyitem[1].encode("utf-8"))
ET.SubElement( RSSitem, 'guid').text = h.hexdigest() ET.SubElement(RSSitem, "guid").text = h.hexdigest()
RSSfeed = ET.ElementTree(RSSroot) RSSfeed = ET.ElementTree(RSSroot)
feed = ET.tostring(RSSroot) feed = ET.tostring(RSSroot)
@ -62,22 +71,35 @@ def complete_feed(myfeed):
myheader = '<?xml version="1.0"?>' myheader = '<?xml version="1.0"?>'
return myheader + str(myfeed) return myheader + str(myfeed)
def DaysInSec(val): def DaysInSec(val):
return int(val) * 24 * 60 * 60 return int(val) * 24 * 60 * 60
usage = "usage: %prog [options] url" usage = "usage: %prog [options] url"
parser = OptionParser(usage) parser = OptionParser(usage)
parser.add_option("-m","--maxitem",dest="maxitem",help="maximum item to list in the feed, default 200") parser.add_option(
parser.add_option("-i","--interval",dest="interval",help="time interval expressed in days, default 1 day") "-m",
"--maxitem",
dest="maxitem",
help="maximum item to list in the feed, default 200",
)
parser.add_option(
"-i",
"--interval",
dest="interval",
help="time interval expressed in days, default 1 day",
)
# 2007-11-10 11:25:51 # 2007-11-10 11:25:51
pattern = '%Y-%m-%d %H:%M:%S' pattern = "%Y-%m-%d %H:%M:%S"
(options, args) = parser.parse_args() (options, args) = parser.parse_args()
if options.interval == None: if options.interval is None:
options.interval = 1
options.output = 1 options.output = 1
if options.maxitem == None: if options.maxitem == None:
@ -93,6 +115,9 @@ url = args[0]
d = feedparser.parse(url) d = feedparser.parse(url)
if options.interval is None:
options.interval = 0
interval = DaysInSec(options.interval) interval = DaysInSec(options.interval)
previousepoch = [] previousepoch = []
@ -100,9 +125,12 @@ clusteredepoch = []
tcluster = [] tcluster = []
for el in d.entries: for el in d.entries:
if 'modified_parsed' in el:
eldatetime = datetime.datetime.fromtimestamp(time.mktime(el.modified_parsed)) eldatetime = datetime.datetime.fromtimestamp(time.mktime(el.modified_parsed))
elepoch = int(time.mktime(time.strptime(unicode(eldatetime), pattern))) else:
eldatetime = datetime.datetime.fromtimestamp(time.mktime(el.published_parsed))
elepoch = int(time.mktime(time.strptime(str(eldatetime), pattern)))
if len(previousepoch): if len(previousepoch):
@ -112,9 +140,10 @@ for el in d.entries:
value = clusteredepoch.pop() value = clusteredepoch.pop()
else: else:
value = "" value = ""
if 'title' in el:
clusteredepoch.append(value+" <a href=\""+el.link+"\">"+el.title+"</a>") clusteredepoch.append(value + ' <a href="' + el.link + '">' + el.title + "</a>")
else:
clusteredepoch.append(value + ' <a href="' + el.link + '">' + el.summary + "</a>")
if not ((int(previousepoch[0]) - int(elepoch)) < interval): if not ((int(previousepoch[0]) - int(elepoch)) < interval):
@ -122,13 +151,26 @@ for el in d.entries:
starttimetuple = datetime.datetime.fromtimestamp(previousepoch[0]) starttimetuple = datetime.datetime.fromtimestamp(previousepoch[0])
endttimetuple = datetime.datetime.fromtimestamp(previousepoch.pop()) endttimetuple = datetime.datetime.fromtimestamp(previousepoch.pop())
clusteredepoch.append(value+ " from: "+unicode(starttimetuple.ctime())+" to: "+unicode(endttimetuple.ctime())) clusteredepoch.append(
startdatelist = unicode(previousepoch[0]),unicode(clusteredepoch[len(clusteredepoch)-1]) value
+ " from: "
+ str(starttimetuple.ctime())
+ " to: "
+ str(endttimetuple.ctime())
)
if previousepoch:
startdatelist = str(previousepoch[0]), str(
clusteredepoch[len(clusteredepoch) - 1]
)
tcluster.append(startdatelist) tcluster.append(startdatelist)
del previousepoch[0 : len(previousepoch)] del previousepoch[0 : len(previousepoch)]
del clusteredepoch[0 : len(clusteredepoch)] del clusteredepoch[0 : len(clusteredepoch)]
else: else:
clusteredepoch.append(" <a href=\""+el.link+"\">"+el.title+"</a>") if 'title' in el:
clusteredepoch.append(' <a href="' + el.link + '">' + el.title + "</a>")
else:
clusteredepoch.append(' <a href="' + el.link + '">' + el.summary + "</a>")
previousepoch.append(elepoch) previousepoch.append(elepoch)
# if last cluster list was not complete, we add the time period information. # if last cluster list was not complete, we add the time period information.
@ -136,13 +178,16 @@ if len(previousepoch):
value = clusteredepoch.pop() value = clusteredepoch.pop()
starttimetuple = datetime.datetime.fromtimestamp(previousepoch[0]) starttimetuple = datetime.datetime.fromtimestamp(previousepoch[0])
endttimetuple = datetime.datetime.fromtimestamp(previousepoch.pop()) endttimetuple = datetime.datetime.fromtimestamp(previousepoch.pop())
clusteredepoch.append(value+ " from: "+unicode(starttimetuple.ctime())+" to: "+unicode(endttimetuple.ctime())) clusteredepoch.append(
value
+ " from: "
+ str(starttimetuple.ctime())
+ " to: "
+ str(endttimetuple.ctime())
)
del previousepoch[0 : len(previousepoch)] del previousepoch[0 : len(previousepoch)]
tcluster.sort() tcluster.sort()
tcluster.reverse() tcluster.reverse()
print complete_feed(build_rss(tcluster,int(options.maxitem))) print(complete_feed(build_rss(tcluster, int(options.maxitem))))