Merge branch 'master' of github.com:adulau/rss-tools

2024-12-22 00:15:57 +00:00 · 2024-03-04 15:03:44 +01:00 · 2024-03-04 15:03:44 +01:00 · 779f49d143
commit 779f49d143
parent 7cbd0d037d 4f26394692
3 changed files with 181 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -15,6 +15,30 @@ As 2024 marks the resurgence of RSS and Atom[^1], I decided to update my rudimen
 ## Tools
 ### rssfind
 [rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script designed to discover RSS or Atom feeds from a given URL.
 It employs two techniques:
 - The first involves searching for direct link references to the feed within the HTML page.
 - The second uses a brute-force approach, trying a series of known paths for feeds to determine if they are valid RSS or Atom feeds.
 The script returns an array in JSON format containing all the potential feeds it discovers.
 ~~~shell
 Usage: Find RSS or Atom feeds from an URL
 usage: rssfind.py [options]
 Options:
  -h, --help            show this help message and exit
  -l LINK, --link=LINK  http link where to find one or more feed source(s)
  -d, --disable-strict  Include empty feeds in the list, default strict is
                        enabled
  -b, --brute-force     Search RSS/Atom feeds by brute-forcing url path
                        (useful if the page is missing a link entry)
 ~~~
 ### rsscluster
 [rsscluster.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script that clusters items from an RSS feed based on a specified time interval, expressed in days.
--- a/2
+++ b/2
@ -2,3 +2,5 @@ arrow
 bs4
 feedparser
 feedgen
 orjson
 requests
--- a/bin/rssfind.py
+++ b/bin/rssfind.py
@ -0,0 +1,155 @@
 #!/usr/bin/python3
 # [rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script designed to discover RSS or Atom feeds from a given URL.
 #
 # It employs two techniques:
 #
 # - The first involves searching for direct link references to the feed within the HTML page.
 # - The second uses a brute-force approach, trying a series of known paths for feeds to determine if they are valid RSS or Atom feeds.
 #
 # The script returns an array in JSON format containing all the potential feeds it discovers.
 import sys
 import urllib.parse
 from optparse import OptionParser
 import random
 import feedparser
 import orjson as json
 import requests
 from bs4 import BeautifulSoup as bs4
 # Relative paths probed by brutefindfeeds() at the root of the target site.
 # The listing order carries no meaning: the list is shuffled right below.
 brute_force_urls = [
    # "feed.*" family
    "feed.xml",
    "feed.atom",
    "feed.rss",
    "feed.json",
    "feed.php",
    "feed.asp",
    "feed/index.php",
    # "main.*" and "index.*" family
    "main.xml",
    "main.atom",
    "index.xml",
    # content-named feeds
    "atom.xml",
    "blog.xml",
    "posts.rss",
    "podcasts.xml",
 ]
 # Randomise the probing order in place so every run walks the paths differently.
 random.shuffle(brute_force_urls)
 def findfeeds(url=None, disable_strict=False, timeout=10):
    """Discover feed URLs referenced by the HTML page at *url*.

    Two kinds of references are collected:

    - ``<link rel="alternate">`` tags whose ``type`` mentions "feed", "rss"
      or "xml";
    - ``<a>`` tags whose ``href`` mentions "feed", "rss" or "xml".

    Every reference is resolved against the URL the page was finally served
    from, so absolute, root-relative and page-relative hrefs all yield a
    usable absolute URL. Only http(s) results are kept.

    Args:
        url: Address of the HTML page to inspect.
        disable_strict: When true, return every candidate without checking
            that it parses as a feed with at least one entry.
        timeout: Seconds to wait for the page download before giving up.

    Returns:
        ``None`` when *url* is ``None``; otherwise a list of unique feed
        URLs in the order they appear in the page.

    Raises:
        requests.RequestException: if the page itself cannot be fetched.
    """
    if url is None:
        return None
    response = requests.get(url, headers=headers, timeout=timeout)
    html = bs4(response.text, features="lxml")
    markers = ("feed", "rss", "xml")

    hrefs = []
    for link in html.find_all("link", rel="alternate"):
        mime = link.get("type") or ""
        href = link.get("href")
        if href and any(marker in mime for marker in markers):
            hrefs.append(href)
    for anchor in html.find_all("a"):
        href = anchor.get("href")
        if href and any(marker in href for marker in markers):
            hrefs.append(href)

    # Resolve against the final (post-redirect) page URL rather than gluing
    # the href onto scheme://host, which corrupts absolute hrefs and relative
    # ones lacking a leading slash, and loses any explicit port.
    page_url = response.url or url
    discovered_feeds = []
    for href in hrefs:
        absolute = urllib.parse.urljoin(page_url, href)
        # Drop mailto:, javascript: and similar links that merely contain
        # one of the marker words.
        if urllib.parse.urlparse(absolute).scheme in ("http", "https"):
            discovered_feeds.append(absolute)
    # De-duplicate while keeping document order so the output is stable.
    discovered_feeds = list(dict.fromkeys(discovered_feeds))

    if disable_strict:
        return discovered_feeds
    # Strict mode: keep only candidates that parse with at least one entry.
    return [feed for feed in discovered_feeds if feedparser.parse(feed).entries]
 def brutefindfeeds(url=None, disable_strict=False, timeout=10):
    """Probe well-known feed paths at the root of the site hosting *url*.

    Each entry of ``brute_force_urls`` is requested at
    ``<scheme>://<netloc>/<path>``; paths answering HTTP 200 are candidates.

    Args:
        url: Any URL on the site to probe; only its scheme and network
            location are used.
        disable_strict: When true, return every path that answered 200
            without checking that it parses as a feed with entries.
        timeout: Seconds to wait for each probe before skipping it.

    Returns:
        ``None`` when *url* is ``None``; otherwise a list of unique feed
        URLs in probing order.
    """
    if url is None:
        return None
    parsed_url = urllib.parse.urlparse(url)
    # netloc (not hostname) keeps an explicit port and IPv6 brackets, so
    # sites such as http://localhost:8080/ are probed at the right origin.
    root = f"{parsed_url.scheme}://{parsed_url.netloc}"

    found_urls = []
    for path in brute_force_urls:
        candidate = f"{root}/{path}"
        try:
            r = requests.get(candidate, headers=headers, timeout=timeout)
        except requests.RequestException:
            # A failed probe only means this candidate is unavailable; it
            # must not abort the remaining probes.
            continue
        if r.status_code == 200:
            found_urls.append(candidate)
    # De-duplicate while keeping probing order.
    found_urls = list(dict.fromkeys(found_urls))

    if disable_strict:
        return found_urls
    # Strict mode: keep only candidates that parse with at least one entry.
    return [feed for feed in found_urls if feedparser.parse(feed).entries]
 version = "0.2"
 user_agent = f"rssfind.py {version} +https://github.com/adulau/rss-tools"
 # Use the same User-Agent for feedparser fetches and for requests calls.
 feedparser.USER_AGENT = user_agent
 headers = {"User-Agent": user_agent}
 usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]"
 parser = OptionParser(usage)
 parser.add_option(
    "-l",
    "--link",
    dest="link",
    help="http link where to find one or more feed source(s)",
 )
 # store_true is required here: with store_false and default=False the flag
 # could never turn strict mode off, making the option a no-op.
 parser.add_option(
    "-d",
    "--disable-strict",
    dest="disable_strict",
    action="store_true",
    default=False,
    help="Include empty feeds in the list, default strict is enabled",
 )
 parser.add_option(
    "-b",
    "--brute-force",
    dest="brute_force",
    action="store_true",
    default=False,
    help="Search RSS/Atom feeds by brute-forcing url path (useful if the page is missing a link entry)",
 )
 (options, args) = parser.parse_args()
 if not options.link:
    print("Link/url missing - -l option")
    parser.print_help()
    sys.exit(0)
 # Pick the discovery technique, then print its result as a JSON array.
 finder = brutefindfeeds if options.brute_force else findfeeds
 print(
    json.dumps(
        finder(url=options.link, disable_strict=options.disable_strict)
    ).decode("utf-8")
 )