chg: [rsscluster] Python2 to Python3 + updates

- Trying to resurrect this old script to merge RSS feeds
- Testing with a Mastodon RSS feed (which doesn't contain item titles) -> https://github.com/mastodon/mastodon/issues/18553
  - Maybe the title should be removed completely (per RSS 2.0 an item only
    needs a `title` or a `description`, so a feed can work without the
    `title` field); see the sketch below
- Fixing some random bugs in this 11-year-old code
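
A minimal sketch of that title fallback, assuming only that feedparser is installed (the feed URL is the same example used in the script below; entries without a title fall back to their summary):

    import feedparser

    # Example Mastodon feed (same URL as in the script); its items usually lack <title>.
    d = feedparser.parse("https://paperbay.org/@a.rss")

    for entry in d.entries:
        # RSS 2.0 only requires an item to carry a title or a description,
        # so use the summary as the label when no title is present.
        label = entry.title if "title" in entry else entry.summary
        print(label, entry.link)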
Alexandre Dulaunoy 2024-02-11 10:49:20 +01:00
parent fe766ccbbc
commit 54a82c0ce9
Signed by: adulau
GPG key ID: 09E2CD4944E6CBCD


@@ -7,50 +7,59 @@
 # time interval (expressed in number of days). The maxitem is the
 # number of item maximum after the clustering.
 #
-# an example use is for del.icio.us where you can have a lot of bookmarks during
+# an example use is for Mastodon where you can have a lot of toots during
 # one day and you want to cluster them in one single item in RSS or in (X)HTML.
 #
 # example of use :
-# python2.5 rsscluster.py --interval 5 --maxitem 20 "http://del.icio.us/rss/adulau" >adulau.xml
+# python3 rsscluster.py --interval 5 --maxitem 20 "https://paperbay.org/@a.rss" >adulau.xml
 import feedparser
-import sys,os
+import sys, os
 import time
 import datetime
 import xml.etree.ElementTree as ET
 import hashlib
 from optparse import OptionParser
-#print sys.stdout.encoding
+# print sys.stdout.encoding
 version = "0.2"
-feedparser.USER_AGENT = "rsscluster.py "+ version + " +http://www.foo.be/"
+feedparser.USER_AGENT = "rsscluster.py " + version + " +http://www.foo.be/"
 def date_as_rfc(value):
     return time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime(value))
-def build_rss(myitem,maxitem):
-    RSSroot = ET.Element( 'rss', {'version':'2.0'} )
-    RSSchannel = ET.SubElement( RSSroot, 'channel' )
-    ET.SubElement( RSSchannel, 'title' ).text = 'RSS cluster of ' + str(url) +' per '+options.interval+' days'
-    ET.SubElement( RSSchannel, 'link' ).text = str(url)
-    ET.SubElement( RSSchannel, 'description' ).text = 'RSS cluster of ' + str(url) +' per '+options.interval+' days'
-    ET.SubElement( RSSchannel, 'generator' ).text = 'by rsscluster.py ' + version
-    ET.SubElement( RSSchannel, 'pubDate' ).text = date_as_rfc(time.time())
+def build_rss(myitem, maxitem):
+    RSSroot = ET.Element("rss", {"version": "2.0"})
+    RSSchannel = ET.SubElement(RSSroot, "channel")
+    ET.SubElement(RSSchannel, "title").text = (
+        "RSS cluster of " + str(url) + " per " + str(options.interval) + " days"
+    )
+    ET.SubElement(RSSchannel, "link").text = str(url)
+    ET.SubElement(RSSchannel, "description").text = (
+        "RSS cluster of " + str(url) + " per " + str(options.interval) + " days"
+    )
+    ET.SubElement(RSSchannel, "generator").text = "by rsscluster.py " + version
+    ET.SubElement(RSSchannel, "pubDate").text = date_as_rfc(time.time())
     for bloodyitem in myitem[0:maxitem]:
-        RSSitem = ET.SubElement ( RSSchannel, 'item' )
-        ET.SubElement( RSSitem, 'title' ).text = 'clustered data of ' + date_as_rfc(float(bloodyitem[0])) +" for "+ str(url)
-        ET.SubElement( RSSitem, 'pubDate' ).text = date_as_rfc(float(bloodyitem[0]))
-        ET.SubElement( RSSitem, 'description').text = bloodyitem[1]
+        RSSitem = ET.SubElement(RSSchannel, "item")
+        ET.SubElement(RSSitem, "title").text = (
+            "clustered data of "
+            + date_as_rfc(float(bloodyitem[0]))
+            + " for "
+            + str(url)
+        )
+        ET.SubElement(RSSitem, "pubDate").text = date_as_rfc(float(bloodyitem[0]))
+        ET.SubElement(RSSitem, "description").text = bloodyitem[1]
         h = hashlib.md5()
-        h.update(bloodyitem[1])
-        ET.SubElement( RSSitem, 'guid').text = h.hexdigest()
+        h.update(bloodyitem[1].encode("utf-8"))
+        ET.SubElement(RSSitem, "guid").text = h.hexdigest()
     RSSfeed = ET.ElementTree(RSSroot)
     feed = ET.tostring(RSSroot)
@@ -62,22 +71,35 @@ def complete_feed(myfeed):
     myheader = '<?xml version="1.0"?>'
     return myheader + str(myfeed)
 def DaysInSec(val):
-    return int(val)*24*60*60
+    return int(val) * 24 * 60 * 60
 usage = "usage: %prog [options] url"
 parser = OptionParser(usage)
-parser.add_option("-m","--maxitem",dest="maxitem",help="maximum item to list in the feed, default 200")
-parser.add_option("-i","--interval",dest="interval",help="time interval expressed in days, default 1 day")
-#2007-11-10 11:25:51
-pattern = '%Y-%m-%d %H:%M:%S'
+parser.add_option(
+    "-m",
+    "--maxitem",
+    dest="maxitem",
+    help="maximum item to list in the feed, default 200",
+)
+parser.add_option(
+    "-i",
+    "--interval",
+    dest="interval",
+    help="time interval expressed in days, default 1 day",
+)
+# 2007-11-10 11:25:51
+pattern = "%Y-%m-%d %H:%M:%S"
 (options, args) = parser.parse_args()
-if options.interval == None:
+if options.interval is None:
     options.interval = 1
     options.output = 1
 if options.maxitem == None:
@@ -93,6 +115,9 @@ url = args[0]
 d = feedparser.parse(url)
+if options.interval is None:
+    options.interval = 0
 interval = DaysInSec(options.interval)
 previousepoch = []
@@ -100,35 +125,52 @@ clusteredepoch = []
 tcluster = []
 for el in d.entries:
-    eldatetime = datetime.datetime.fromtimestamp(time.mktime(el.modified_parsed))
-    elepoch = int(time.mktime(time.strptime(unicode(eldatetime), pattern)))
+    if 'modified_parsed' in el:
+        eldatetime = datetime.datetime.fromtimestamp(time.mktime(el.modified_parsed))
+    else:
+        eldatetime = datetime.datetime.fromtimestamp(time.mktime(el.published_parsed))
+    elepoch = int(time.mktime(time.strptime(str(eldatetime), pattern)))
     if len(previousepoch):
-        #print el.link, int(previousepoch[0])-int(elepoch), interval
+        # print el.link, int(previousepoch[0])-int(elepoch), interval
         if len(clusteredepoch):
             value = clusteredepoch.pop()
         else:
             value = ""
-        clusteredepoch.append(value+" <a href=\""+el.link+"\">"+el.title+"</a>")
-        if not ((int(previousepoch[0])-int(elepoch)) < interval):
+        if 'title' in el:
+            clusteredepoch.append(value + ' <a href="' + el.link + '">' + el.title + "</a>")
+        else:
+            clusteredepoch.append(value + ' <a href="' + el.link + '">' + el.summary + "</a>")
+        if not ((int(previousepoch[0]) - int(elepoch)) < interval):
             value = clusteredepoch.pop()
             starttimetuple = datetime.datetime.fromtimestamp(previousepoch[0])
             endttimetuple = datetime.datetime.fromtimestamp(previousepoch.pop())
-            clusteredepoch.append(value+ " from: "+unicode(starttimetuple.ctime())+" to: "+unicode(endttimetuple.ctime()))
-            startdatelist = unicode(previousepoch[0]),unicode(clusteredepoch[len(clusteredepoch)-1])
+            clusteredepoch.append(
+                value
+                + " from: "
+                + str(starttimetuple.ctime())
+                + " to: "
+                + str(endttimetuple.ctime())
+            )
+            if previousepoch:
+                startdatelist = str(previousepoch[0]), str(
+                    clusteredepoch[len(clusteredepoch) - 1]
+                )
             tcluster.append(startdatelist)
-            del previousepoch[0:len(previousepoch)]
-            del clusteredepoch[0:len(clusteredepoch)]
+            del previousepoch[0 : len(previousepoch)]
+            del clusteredepoch[0 : len(clusteredepoch)]
     else:
-        clusteredepoch.append(" <a href=\""+el.link+"\">"+el.title+"</a>")
+        if 'title' in el:
+            clusteredepoch.append(' <a href="' + el.link + '">' + el.title + "</a>")
+        else:
+            clusteredepoch.append(' <a href="' + el.link + '">' + el.summary + "</a>")
     previousepoch.append(elepoch)
 # if last cluster list was not complete, we add the time period information.
@@ -136,13 +178,16 @@ if len(previousepoch):
     value = clusteredepoch.pop()
     starttimetuple = datetime.datetime.fromtimestamp(previousepoch[0])
     endttimetuple = datetime.datetime.fromtimestamp(previousepoch.pop())
-    clusteredepoch.append(value+ " from: "+unicode(starttimetuple.ctime())+" to: "+unicode(endttimetuple.ctime()))
-    del previousepoch[0:len(previousepoch)]
+    clusteredepoch.append(
+        value
+        + " from: "
+        + str(starttimetuple.ctime())
+        + " to: "
+        + str(endttimetuple.ctime())
+    )
+    del previousepoch[0 : len(previousepoch)]
 tcluster.sort()
 tcluster.reverse()
-print complete_feed(build_rss(tcluster,int(options.maxitem)))
+print(complete_feed(build_rss(tcluster, int(options.maxitem))))
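
The `.encode("utf-8")` added to the GUID computation is a Python 3 requirement: `hashlib.md5()` only hashes bytes, not str. A small standalone check of that behaviour (the sample description string is arbitrary):

    import hashlib

    # Arbitrary sample of a clustered item description.
    description = '<a href="https://paperbay.org/@a/1">example toot</a>'

    h = hashlib.md5()
    h.update(description.encode("utf-8"))  # Python 3: hashlib needs bytes
    print(h.hexdigest())  # used as the item's guid in build_rss()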