From 848b96a03c8b19668bf2cd78a269bd46256e240c Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Sun, 3 Mar 2024 21:49:55 +0100 Subject: [PATCH] new: [rssfind.py] a simple script to discover RSS/Atom feeds from an URL --- README.md | 15 +++++++++ REQUIREMENTS | 2 ++ bin/rssfind.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+) create mode 100644 bin/rssfind.py diff --git a/README.md b/README.md index 89c51d4..f0ffe3d 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,21 @@ As 2024 marks the resurgence of RSS and Atom[^1], I decided to update my rudimen ## Tools +### rssfind + +[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script for discovering RSS or Atom feeds from a URL. It returns an array in JSON format of all the potential feeds discovered. + +~~~shell +Usage: Find RSS or Atom feeds from an URL +usage: rssfind.py [options] + +Options: + -h, --help show this help message and exit + -l LINK, --link=LINK http link where to find one or more feed source(s) + -d, --disable-strict Include empty feeds in the list, default strict is + enabled +~~~ + ### rsscluster [rsscluster.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script that clusters items from an RSS feed based on a specified time interval, expressed in days.
diff --git a/REQUIREMENTS b/REQUIREMENTS
index f57478b..7aaddcd 100644
--- a/REQUIREMENTS
+++ b/REQUIREMENTS
@@ -1,2 +1,5 @@
 bs4
 feedparser
+lxml
+orjson
+requests
diff --git a/bin/rssfind.py b/bin/rssfind.py
new file mode 100644
index 0000000..c25b6f6
--- /dev/null
+++ b/bin/rssfind.py
@@ -0,0 +1,117 @@
+#!/usr/bin/python3
+"""Discover RSS/Atom feeds referenced by a web page and print them as JSON."""
+
+import sys
+import urllib.parse
+from optparse import OptionParser
+
+import feedparser
+import orjson as json
+import requests
+from bs4 import BeautifulSoup as bs4
+
+version = "0.2"
+
+# User agent sent by feedparser when candidate feeds are fetched.
+feedparser.USER_AGENT = (
+    "rssfind.py " + version + " +https://github.com/adulau/rss-tools"
+)
+
+# Substrings that mark a MIME type or a link target as a potential feed.
+FEED_HINTS = ("feed", "rss", "xml")
+
+# Seconds to wait for the page so an unresponsive server cannot hang the tool.
+HTTP_TIMEOUT = 30
+
+
+def _looks_like_feed(value):
+    """Return True when *value* contains one of the FEED_HINTS substrings."""
+    return any(hint in value for hint in FEED_HINTS)
+
+
+def _resolve(page_url, href):
+    """Return *href* as an absolute http(s) URL, or None when it is unusable."""
+    try:
+        absolute = urllib.parse.urljoin(page_url, href)
+        scheme = urllib.parse.urlparse(absolute).scheme
+    except ValueError:
+        # urllib rejects some malformed links (e.g. a broken IPv6 literal).
+        return None
+    return absolute if scheme in ("http", "https") else None
+
+
+def findfeeds(url=None, disable_strict=False):
+    """Return the feed URLs discovered on the page located at *url*.
+
+    Candidates are ``link`` elements with ``rel="alternate"`` whose ``type``
+    contains a feed hint, and ``a`` elements whose ``href`` contains one. Each
+    candidate is resolved against *url*, so absolute and relative links both
+    yield a usable address; anything that is not http(s) is dropped.
+
+    In the default strict mode every candidate is parsed with feedparser and
+    kept only when it has at least one entry. When *disable_strict* is true
+    the candidates are returned as found, without being fetched.
+
+    Returns None when *url* is None, otherwise a list without duplicates in
+    order of discovery. Exceptions raised by ``requests.get`` propagate.
+    """
+    if url is None:
+        return None
+
+    raw = requests.get(url, timeout=HTTP_TIMEOUT).text
+    html = bs4(raw, features="lxml")
+
+    hrefs = []
+    for link in html.find_all("link", rel="alternate"):
+        mime = link.get("type")
+        href = link.get("href")
+        if mime and href and _looks_like_feed(mime):
+            hrefs.append(href)
+    for anchor in html.find_all("a"):
+        href = anchor.get("href")
+        if href and _looks_like_feed(href):
+            hrefs.append(href)
+
+    # dict.fromkeys() removes duplicates while keeping the discovery order.
+    resolved = (_resolve(url, href) for href in hrefs)
+    discovered_feeds = list(dict.fromkeys(feed for feed in resolved if feed))
+
+    if disable_strict:
+        return discovered_feeds
+    # Strict mode: keep only the candidates that parse to at least one entry.
+    return [feed for feed in discovered_feeds if feedparser.parse(feed).entries]
+
+
+def main():
+    """Parse the command line and print the discovered feeds as JSON."""
+    usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]"
+    parser = OptionParser(usage)
+    parser.add_option(
+        "-l",
+        "--link",
+        dest="link",
+        help="http link where to find one or more feed source(s)",
+    )
+    parser.add_option(
+        "-d",
+        "--disable-strict",
+        dest="disable_strict",
+        # store_true: store_false with default=False could never enable it.
+        action="store_true",
+        default=False,
+        help="Include empty feeds in the list, default strict is enabled",
+    )
+    options, _ = parser.parse_args()
+
+    if not options.link:
+        print("URL missing")
+        parser.print_help()
+        # A missing mandatory option is an error; report it in the status.
+        sys.exit(1)
+
+    feeds = findfeeds(options.link, disable_strict=options.disable_strict)
+    print(json.dumps(feeds).decode("utf-8"))
+
+
+if __name__ == "__main__":
+    main()