mirror of
https://github.com/adulau/rss-tools.git
synced 2024-11-07 12:06:25 +00:00
chg: [rsscluster] Python2 to Python3 + updates
- Trying to resurrect an old script to merge RSS feeds - Testing with a Mastodon RSS feed (which doesn't contain a title) -> https://github.com/mastodon/mastodon/issues/18553 - Maybe the title needs to be removed completely (it seems an RSS feed can work without a `title` field) - Fixing some random bugs in this 11-year-old code
This commit is contained in:
parent
fe766ccbbc
commit
54a82c0ce9
1 changed files with 100 additions and 55 deletions
|
@ -7,81 +7,103 @@
|
||||||
# time interval (expressed in number of days). The maxitem is the
|
# time interval (expressed in number of days). The maxitem is the
|
||||||
# number of item maximum after the clustering.
|
# number of item maximum after the clustering.
|
||||||
#
|
#
|
||||||
# an example use is for del.icio.us where you can have a lot of bookmarks during
|
# an example use is for Mastodon where you can have a lot of toots during
|
||||||
# one day and you want to cluster them in one single item in RSS or in (X)HTML.
|
# one day and you want to cluster them in one single item in RSS or in (X)HTML.
|
||||||
#
|
#
|
||||||
# example of use :
|
# example of use :
|
||||||
# python2.5 rsscluster.py --interval 5 --maxitem 20 "http://del.icio.us/rss/adulau" >adulau.xml
|
# python3 rsscluster.py --interval 5 --maxitem 20 "https://paperbay.org/@a.rss" >adulau.xml
|
||||||
|
|
||||||
import feedparser
|
import feedparser
|
||||||
import sys,os
|
import sys, os
|
||||||
import time
|
import time
|
||||||
import datetime
|
import datetime
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
import hashlib
|
import hashlib
|
||||||
from optparse import OptionParser
|
from optparse import OptionParser
|
||||||
|
|
||||||
#print sys.stdout.encoding
|
# print sys.stdout.encoding
|
||||||
version = "0.2"
|
version = "0.2"
|
||||||
|
|
||||||
feedparser.USER_AGENT = "rsscluster.py "+ version + " +http://www.foo.be/"
|
feedparser.USER_AGENT = "rsscluster.py " + version + " +http://www.foo.be/"
|
||||||
|
|
||||||
|
|
||||||
def date_as_rfc(value):
    """Format an epoch timestamp as an RFC 822 style date string.

    The date and time fields are taken from ``time.localtime(value)`` and
    the trailing offset is the fixed text ``+0000``.

    NOTE(review): the fields are local time while the offset is hard-coded;
    confirm callers pass values for which that combination is intended.
    """
    rfc_layout = "%a, %d %b %Y %H:%M:%S +0000"
    broken_down = time.localtime(value)
    return time.strftime(rfc_layout, broken_down)
|
||||||
|
|
||||||
|
|
||||||
def build_rss(myitem, maxitem):
    """Serialize clustered entries into an RSS 2.0 document.

    Reads the module-level ``url``, ``options.interval`` and ``version``
    for the channel metadata.

    Args:
        myitem: sequence of entries where ``entry[0]`` is an epoch value
            accepted by ``float()`` and ``entry[1]`` is the ``str`` used
            as the item description.
        maxitem: maximum number of entries, taken from the start of
            ``myitem``, written to the feed.

    Returns:
        The serialized ``<rss>`` element as returned by ``ET.tostring``
        (``bytes`` under Python 3 with the default encoding).
    """
    rss_root = ET.Element("rss", {"version": "2.0"})
    channel = ET.SubElement(rss_root, "channel")

    # Title and description carry the same text, so build it once.
    summary = "RSS cluster of " + str(url) + " per " + str(options.interval) + " days"
    ET.SubElement(channel, "title").text = summary
    ET.SubElement(channel, "link").text = str(url)
    ET.SubElement(channel, "description").text = summary
    ET.SubElement(channel, "generator").text = "by rsscluster.py " + version
    ET.SubElement(channel, "pubDate").text = date_as_rfc(time.time())

    for entry in myitem[:maxitem]:
        stamp = date_as_rfc(float(entry[0]))
        body = entry[1]

        item = ET.SubElement(channel, "item")
        ET.SubElement(item, "title").text = (
            "clustered data of " + stamp + " for " + str(url)
        )
        ET.SubElement(item, "pubDate").text = stamp
        ET.SubElement(item, "description").text = body

        # The guid is a hash of the description, not a URL, so it must be
        # flagged as a non-permalink; otherwise readers may treat it as a link.
        digest = hashlib.md5(body.encode("utf-8")).hexdigest()
        ET.SubElement(item, "guid", {"isPermaLink": "false"}).text = digest

    return ET.tostring(rss_root)
|
||||||
|
|
||||||
|
|
||||||
def complete_feed(myfeed):
    """Prepend the XML declaration to a serialized feed.

    Args:
        myfeed: the serialized feed, as ``bytes`` or ``str``.

    Returns:
        str: the XML declaration immediately followed by the feed text.
    """
    myheader = '<?xml version="1.0"?>'
    # str() on a bytes object yields its "b'...'" repr rather than the
    # markup itself, so bytes input is decoded before concatenation.
    if isinstance(myfeed, (bytes, bytearray)):
        myfeed = myfeed.decode("utf-8")
    return myheader + str(myfeed)
|
||||||
|
|
||||||
|
|
||||||
def DaysInSec(val):
    """Convert a number of days into the equivalent number of seconds.

    ``val`` is passed through ``int()`` first, so numeric strings such as
    ``"5"`` are accepted.
    """
    seconds_per_day = 24 * 60 * 60
    days = int(val)
    return days * seconds_per_day
|
||||||
|
|
||||||
|
|
||||||
# Command-line interface: one positional URL plus two optional settings.
usage = "usage: %prog [options] url"
parser = OptionParser(usage)

parser.add_option(
    "-m",
    "--maxitem",
    dest="maxitem",
    help="maximum item to list in the feed, default 200",
)
parser.add_option(
    "-i",
    "--interval",
    dest="interval",
    help="time interval expressed in days, default 1 day",
)

# Timestamp layout used to parse str(datetime) values,
# e.g. 2007-11-10 11:25:51
pattern = "%Y-%m-%d %H:%M:%S"

(options, args) = parser.parse_args()

# Apply the defaults advertised in the option help texts.
if options.interval is None:
    options.interval = 1
    # NOTE(review): options.output is not read anywhere in the visible
    # code; kept as-is, confirm whether it is still needed.
    options.output = 1

if options.maxitem is None:
    options.maxitem = 200
|
||||||
|
|
||||||
|
|
||||||
if len(args) != 1:
|
if len(args) != 1:
|
||||||
|
@ -93,6 +115,9 @@ url = args[0]
|
||||||
|
|
||||||
d = feedparser.parse(url)
|
d = feedparser.parse(url)
|
||||||
|
|
||||||
|
if options.interval is None:
|
||||||
|
options.interval = 0
|
||||||
|
|
||||||
interval = DaysInSec(options.interval)
|
interval = DaysInSec(options.interval)
|
||||||
|
|
||||||
previousepoch = []
|
previousepoch = []
|
||||||
|
@ -100,35 +125,52 @@ clusteredepoch = []
|
||||||
tcluster = []
|
tcluster = []
|
||||||
|
|
||||||
for el in d.entries:
|
for el in d.entries:
|
||||||
|
if 'modified_parsed' in el:
|
||||||
|
eldatetime = datetime.datetime.fromtimestamp(time.mktime(el.modified_parsed))
|
||||||
|
else:
|
||||||
|
eldatetime = datetime.datetime.fromtimestamp(time.mktime(el.published_parsed))
|
||||||
|
|
||||||
eldatetime = datetime.datetime.fromtimestamp(time.mktime(el.modified_parsed))
|
elepoch = int(time.mktime(time.strptime(str(eldatetime), pattern)))
|
||||||
elepoch = int(time.mktime(time.strptime(unicode(eldatetime), pattern)))
|
|
||||||
|
|
||||||
if len(previousepoch):
|
if len(previousepoch):
|
||||||
|
|
||||||
#print el.link, int(previousepoch[0])-int(elepoch), interval
|
# print el.link, int(previousepoch[0])-int(elepoch), interval
|
||||||
|
|
||||||
if len(clusteredepoch):
|
if len(clusteredepoch):
|
||||||
value = clusteredepoch.pop()
|
value = clusteredepoch.pop()
|
||||||
else:
|
else:
|
||||||
value = ""
|
value = ""
|
||||||
|
if 'title' in el:
|
||||||
|
clusteredepoch.append(value + ' <a href="' + el.link + '">' + el.title + "</a>")
|
||||||
|
else:
|
||||||
|
clusteredepoch.append(value + ' <a href="' + el.link + '">' + el.summary + "</a>")
|
||||||
|
|
||||||
clusteredepoch.append(value+" <a href=\""+el.link+"\">"+el.title+"</a>")
|
if not ((int(previousepoch[0]) - int(elepoch)) < interval):
|
||||||
|
|
||||||
|
|
||||||
if not ((int(previousepoch[0])-int(elepoch)) < interval):
|
|
||||||
|
|
||||||
value = clusteredepoch.pop()
|
value = clusteredepoch.pop()
|
||||||
|
|
||||||
starttimetuple = datetime.datetime.fromtimestamp(previousepoch[0])
|
starttimetuple = datetime.datetime.fromtimestamp(previousepoch[0])
|
||||||
endttimetuple = datetime.datetime.fromtimestamp(previousepoch.pop())
|
endttimetuple = datetime.datetime.fromtimestamp(previousepoch.pop())
|
||||||
clusteredepoch.append(value+ " from: "+unicode(starttimetuple.ctime())+" to: "+unicode(endttimetuple.ctime()))
|
clusteredepoch.append(
|
||||||
startdatelist = unicode(previousepoch[0]),unicode(clusteredepoch[len(clusteredepoch)-1])
|
value
|
||||||
tcluster.append(startdatelist)
|
+ " from: "
|
||||||
del previousepoch[0:len(previousepoch)]
|
+ str(starttimetuple.ctime())
|
||||||
del clusteredepoch[0:len(clusteredepoch)]
|
+ " to: "
|
||||||
|
+ str(endttimetuple.ctime())
|
||||||
|
)
|
||||||
|
if previousepoch:
|
||||||
|
startdatelist = str(previousepoch[0]), str(
|
||||||
|
clusteredepoch[len(clusteredepoch) - 1]
|
||||||
|
)
|
||||||
|
tcluster.append(startdatelist)
|
||||||
|
del previousepoch[0 : len(previousepoch)]
|
||||||
|
del clusteredepoch[0 : len(clusteredepoch)]
|
||||||
else:
|
else:
|
||||||
clusteredepoch.append(" <a href=\""+el.link+"\">"+el.title+"</a>")
|
if 'title' in el:
|
||||||
|
clusteredepoch.append(' <a href="' + el.link + '">' + el.title + "</a>")
|
||||||
|
else:
|
||||||
|
clusteredepoch.append(' <a href="' + el.link + '">' + el.summary + "</a>")
|
||||||
|
|
||||||
previousepoch.append(elepoch)
|
previousepoch.append(elepoch)
|
||||||
|
|
||||||
# if last cluster list was not complete, we add the time period information.
|
# if last cluster list was not complete, we add the time period information.
|
||||||
|
@ -136,13 +178,16 @@ if len(previousepoch):
|
||||||
value = clusteredepoch.pop()
|
value = clusteredepoch.pop()
|
||||||
starttimetuple = datetime.datetime.fromtimestamp(previousepoch[0])
|
starttimetuple = datetime.datetime.fromtimestamp(previousepoch[0])
|
||||||
endttimetuple = datetime.datetime.fromtimestamp(previousepoch.pop())
|
endttimetuple = datetime.datetime.fromtimestamp(previousepoch.pop())
|
||||||
clusteredepoch.append(value+ " from: "+unicode(starttimetuple.ctime())+" to: "+unicode(endttimetuple.ctime()))
|
clusteredepoch.append(
|
||||||
del previousepoch[0:len(previousepoch)]
|
value
|
||||||
|
+ " from: "
|
||||||
|
+ str(starttimetuple.ctime())
|
||||||
|
+ " to: "
|
||||||
|
+ str(endttimetuple.ctime())
|
||||||
|
)
|
||||||
|
del previousepoch[0 : len(previousepoch)]
|
||||||
|
|
||||||
|
|
||||||
tcluster.sort()
|
tcluster.sort()
|
||||||
tcluster.reverse()
|
tcluster.reverse()
|
||||||
print complete_feed(build_rss(tcluster,int(options.maxitem)))
|
print(complete_feed(build_rss(tcluster, int(options.maxitem))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue