new: [rssfind.py] a simple script to discover RSS/Atom feeds from an URL

2024-11-21 17:47:07 +00:00 · 2024-03-03 21:49:55 +01:00 · 2024-03-03 21:49:55 +01:00 · 848b96a03c
commit 848b96a03c
parent 243ac1d233
3 changed files with 106 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -15,6 +15,21 @@ As 2024 marks the resurgence of RSS and Atom[^1], I decided to update my rudimen
 ## Tools
 ### rssfind
 [rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script for discovering RSS or Atom feeds from a URL. It returns an array in JSON format of all the potential feeds discovered.
 ~~~shell
 Usage: Find RSS or Atom feeds from an URL
 usage: rssfind.py [options]
 Options:
  -h, --help            show this help message and exit
  -l LINK, --link=LINK  http link where to find one or more feed source(s)
  -d, --disable-strict  Include empty feeds in the list, default strict is
                        enabled
 ~~~
 ### rsscluster
 [rsscluster.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script that clusters items from an RSS feed based on a specified time interval, expressed in days.
--- a/2
+++ b/2
@ -1,2 +1,4 @@
 bs4
 feedparser
 orjson
 requests
--- a/bin/rssfind.py
+++ b/bin/rssfind.py
@ -0,0 +1,89 @@
 #!/usr/bin/python3
 import sys
 import urllib.parse
 from optparse import OptionParser
 import feedparser
 import orjson as json
 import requests
 from bs4 import BeautifulSoup as bs4
 def _looks_like_feed(text):
    """Return True when *text* contains one of the feed hint substrings."""
    return any(hint in text for hint in ("feed", "rss", "xml"))


 def findfeeds(url=None, disable_strict=False):
    """Discover candidate RSS/Atom feed URLs referenced by the page at *url*.

    Two sources are scanned in the fetched HTML:

    * ``<link rel="alternate">`` tags whose ``type`` attribute contains
      "feed", "rss" or "xml";
    * ``<a>`` tags whose ``href`` contains "feed", "rss" or "xml".

    Every discovered href is resolved against *url*, so relative and
    absolute links both end up as usable absolute URLs.

    Args:
        url: Address of the HTML page to inspect.
        disable_strict: When true, return every candidate without checking
            it. When false (default), each candidate is parsed with
            feedparser and kept only if it yields at least one entry.

    Returns:
        ``None`` when *url* is ``None``; otherwise a list of unique feed
        URLs in order of first appearance in the page.

    Raises:
        requests.exceptions.RequestException: if fetching *url* fails or
            exceeds the timeout.
    """
    if url is None:
        return None

    # A timeout keeps the script from hanging forever on an unresponsive host.
    raw = requests.get(url, timeout=30).text
    html = bs4(raw, features="lxml")

    discovered_feeds = []
    for link in html.find_all("link", rel="alternate"):
        tag = link.get("type")
        href = link.get("href")
        if tag and href and _looks_like_feed(tag):
            # urljoin leaves absolute hrefs untouched and resolves relative
            # ones against the page URL (keeping scheme, port and path).
            discovered_feeds.append(urllib.parse.urljoin(url, href))

    for anchor in html.find_all("a"):
        href = anchor.get("href")
        if href and _looks_like_feed(href):
            discovered_feeds.append(urllib.parse.urljoin(url, href))

    # dict.fromkeys de-duplicates while preserving discovery order.
    candidates = list(dict.fromkeys(discovered_feeds))
    if disable_strict:
        # Non-strict mode returns candidates as-is; no need to fetch them.
        return candidates
    return [feed for feed in candidates if feedparser.parse(feed).entries]
 # Script version; embedded in the User-Agent string configured below.
 version = "0.2"
 # Identify this tool through feedparser's module-level User-Agent setting.
 feedparser.USER_AGENT = (
    "rssfind.py " + version + " +https://github.com/adulau/rss-tools"
 )
 usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]"
 parser = OptionParser(usage)
 parser.add_option(
    "-l",
    "--link",
    dest="link",
    help="http link where to find one or more feed source(s)",
 )
 # The flag must *set* disable_strict when given. With "store_false" and a
 # False default the option could never become True, so -d had no effect.
 parser.add_option(
    "-d",
    "--disable-strict",
    dest="disable_strict",
    action="store_true",
    default=False,
    help="Include empty feeds in the list, default strict is enabled",
 )
 (options, args) = parser.parse_args()
 # A link is mandatory: without it, show the help text and stop.
 if not options.link:
    print("URL missing")
    parser.print_help()
    sys.exit(0)
 # json is orjson here: dumps() returns bytes, hence the decode before print.
 print(
    json.dumps(findfeeds(options.link, disable_strict=options.disable_strict)).decode(
        "utf-8"
    )
 )