#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# a at foo dot be - Alexandre Dulaunoy - https://git.foo.be/adulau/rss-tools
#
# rssmerge.py is a simple script that aggregates RSS/Atom feeds and merges
# their entries in reverse chronological order. It outputs the merged content
# in text, HTML, or Markdown format, which is useful for tracking recent
# events from various feeds and publishing them on your website.
#
# Sample usage:
#
# python3 rssmerge.py "https://git.foo.be/adulau.rss" "http://api.flickr.com/services/feeds/photos_public.gne?id=31797858@N00&lang=en-us&format=atom"
# "https://github.com/adulau.atom" -o markdown --maxitem 20

import feedparser
import time
import datetime
import hashlib
from optparse import OptionParser
import html
from bs4 import BeautifulSoup
from urllib.parse import urlparse
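
# feedparser sends this string as the HTTP User-Agent header on every feed
# fetch, so feed operators can identify the client.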
feedparser.USER_AGENT = "rssmerge.py +https://github.com/adulau/rss-tools"


def RenderMerge(itemlist, output="text"):
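    # itemlist is a list of (epoch, key) tuples sorted newest first; each key
    # resolves to the entry's metadata in the global allitem dict built below.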
    i = 0
    if output == "text":
        for item in itemlist:
            i += 1
            # Use the stored epoch for a consistent datetime representation;
            # allitem[item[1]]["updated"] would keep each feed's own format.
            link = allitem[item[1]]["link"]
            title = html.escape(allitem[item[1]]["title"])
            timestamp = datetime.datetime.fromtimestamp(
                allitem[item[1]]["epoch"]
            ).ctime()
            print(f'{i}:{title}:{timestamp}:{link}')
            if i == int(options.maxitem):
                break

    if output == "phtml":
        print("<ul>")
        for item in itemlist:
            i += 1
            # Use the stored epoch for a consistent datetime representation;
            # allitem[item[1]]["updated"] would keep each feed's own format.
            link = allitem[item[1]]["link"]
            title = html.escape(allitem[item[1]]["title"])
            timestamp = datetime.datetime.fromtimestamp(
                allitem[item[1]]["epoch"]
            ).ctime()
            print(f'<li><a href="{link}"> {title}</a> --- (<i>{timestamp}</i>)</li>')
            if i == int(options.maxitem):
                break
        print("</ul>")

    if output == "markdown":
        for item in itemlist:
            i += 1
            title = html.escape(allitem[item[1]]["title"])
            link = allitem[item[1]]["link"]
            timestamp = datetime.datetime.fromtimestamp(
                allitem[item[1]]["epoch"]
            ).ctime()
            domain = urlparse(link).netloc
            print(f'- {domain} [{title}]({link}) @{timestamp}')
            if i == int(options.maxitem):
                break


usage = "usage: %prog [options] url [url ...]"
parser = OptionParser(usage)
parser.add_option(
    "-m",
    "--maxitem",
    dest="maxitem",
    default=200,
    help="maximum number of items to list in the merged feed (default: 200)",
)
parser.add_option(
    "-s",
    "--summarysize",
    dest="summarysize",
    default=60,
    help="maximum size of the summary used as title when a title is not present (default: 60)",
)
parser.add_option(
    "-o",
    "--output",
    dest="output",
    default="text",
    help="output format: text, phtml, or markdown (default: text)",
)

(options, args) = parser.parse_args()
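
# allitem maps the MD5 hash of each entry link to that entry's metadata
# (link, epoch, updated, title).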
allitem = {}
for url in args:
d = feedparser.parse(url)
for el in d.entries:
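        # Prefer the entry's modification time; fall back to its publication time.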
        if "modified_parsed" in el:
            elepoch = int(time.mktime(el.modified_parsed))
        else:
            elepoch = int(time.mktime(el.published_parsed))
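        # Key the entry by the MD5 of its link so the same entry seen in
        # several feeds is stored only once.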
        h = hashlib.md5()
        h.update(el.link.encode("utf-8"))
        linkkey = h.hexdigest()
        allitem[linkkey] = {}
        allitem[linkkey]["link"] = str(el.link)
        allitem[linkkey]["epoch"] = elepoch
        allitem[linkkey]["updated"] = el.get("updated", "")
        if "title" in el:
            allitem[linkkey]["title"] = html.unescape(el.title)
        else:
            cleantext = BeautifulSoup(el.summary, "lxml").text
            allitem[linkkey]["title"] = cleantext[: options.summarysize]
itemlist = []

for key in allitem:
    epochkeytuple = (allitem[key]["epoch"], key)
    itemlist.append(epochkeytuple)
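
# (epoch, key) tuples compare by epoch first, so sorting with reverse=True
# puts the newest entries at the front.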
itemlist.sort(reverse=True)
RenderMerge(itemlist, options.output)