chg: [rssfind] added a brute-force mode -b to discover potential feed source

2024-11-21 17:47:07 +00:00 · 2024-03-04 11:14:28 +01:00 · 2024-03-04 11:14:28 +01:00 · 4f26394692
commit 4f26394692
parent 149c6b4489
2 changed files with 79 additions and 7 deletions
--- a/README.md
+++ b/README.md
@ -17,7 +17,14 @@ As 2024 marks the resurgence of RSS and Atom[^1], I decided to update my rudimen
 ### rssfind
-[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script for discovering RSS or Atom feeds from a URL. It returns an array in JSON format of all the potential feeds discovered.
+[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script designed to discover RSS or Atom feeds from a given URL.
 It employs two techniques:
 - The first involves searching for direct link references to the feed within the HTML page.
 - The second uses a brute-force approach, trying a series of known paths for feeds to determine if they are valid RSS or Atom feeds.
 The script returns an array in JSON format containing all the potential feeds it discovers.
 ~~~shell
 Usage: Find RSS or Atom feeds from an URL
@ -28,6 +35,8 @@ Options:
  -l LINK, --link=LINK  http link where to find one or more feed source(s)
  -d, --disable-strict  Include empty feeds in the list, default strict is
                        enabled
  -b, --brute-force     Search RSS/Atom feeds by brute-forcing url path
                        (useful if the page is missing a link entry)
 ~~~
 ### rsscluster
--- a/bin/rssfind.py
+++ b/bin/rssfind.py
@ -1,14 +1,41 @@
 #!/usr/bin/python3
 # [rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script designed to discover RSS or Atom feeds from a given URL.
 #
 # It employs two techniques:
 #
 # - The first involves searching for direct link references to the feed within the HTML page.
 # - The second uses a brute-force approach, trying a series of known paths for feeds to determine if they are valid RSS or Atom feeds.
 #
 # The script returns an array in JSON format containing all the potential feeds it discovers.
 import sys
 import urllib.parse
 from optparse import OptionParser
 import random
 import feedparser
 import orjson as json
 import requests
 from bs4 import BeautifulSoup as bs4
 # Relative paths that brutefindfeeds() appends to the site root when probing
 # for a feed. Each entry is joined as "<scheme>://<host>/<path>".
 brute_force_urls = [
    "index.xml",
    "feed/index.php",
    "feed.xml",
    "feed.atom",
    "feed.rss",
    "feed.json",
    "feed.php",
    "feed.asp",
    "posts.rss",
    "blog.xml",
    "atom.xml",
    "podcasts.xml",
    "main.atom",
    "main.xml",
 ]
 # Shuffled once, in place, at import time, so the probe order differs from
 # run to run. NOTE(review): presumably to avoid a fixed, recognisable request
 # pattern - confirm intent with the author.
 random.shuffle(brute_force_urls)
 def findfeeds(url=None, disable_strict=False):
    if url is None:
@ -50,13 +77,34 @@ def findfeeds(url=None, disable_strict=False):
        return results
 def brutefindfeeds(url=None, disable_strict=False, timeout=10):
    """Probe well-known feed paths on the site of `url` and return the hits.
    Every entry of the module-level `brute_force_urls` list is requested
    under the root of the site (scheme + network location of `url`).
    :param url: any URL on the target site; only its scheme and netloc are used.
    :param disable_strict: when True, return every path answering HTTP 200
        without checking that it parses as a feed with entries.
    :param timeout: per-request timeout in seconds handed to requests.get().
    :return: None when `url` is None, otherwise a list of candidate feed URLs
        in probe order (no duplicates).
    """
    if url is None:
        return None
    parsed_url = urllib.parse.urlparse(url)
    # Use netloc rather than hostname: hostname drops an explicit port and
    # strips the brackets of IPv6 literals, so sites on a non-default port
    # were probed at the wrong address.
    base = f"{parsed_url.scheme}://{parsed_url.netloc}"
    found_urls = []
    for path in brute_force_urls:
        candidate = f"{base}/{path}"
        try:
            r = requests.get(candidate, headers=headers, timeout=timeout)
        except requests.RequestException:
            # A single unreachable or slow path must not abort the scan.
            continue
        if r.status_code == 200:
            found_urls.append(candidate)
    # Order-preserving de-duplication keeps the output deterministic for a
    # given probe order (a set would reorder it arbitrarily).
    found_urls = list(dict.fromkeys(found_urls))
    if disable_strict:
        # Non-strict mode reports every 200 response; no need to parse them.
        return found_urls
    # Strict mode: keep only URLs that feedparser can read entries from.
    return [u for u in found_urls if feedparser.parse(u).entries]
 # Tool version; embedded in the User-Agent string below.
 version = "0.2"
 # User-Agent naming the tool, its version and the project URL.
 user_agent = f"rssfind.py {version} +https://github.com/adulau/rss-tools"
 # Set feedparser's module-level User-Agent to the same string.
 feedparser.USER_AGENT = user_agent
 # Headers passed to requests.get() in brutefindfeeds().
 headers = {"User-Agent": user_agent}
 # Usage banner text. NOTE(review): presumably handed to OptionParser, which
 # expands %prog to the program name - the constructor call is not visible here.
 usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]"
@ -78,15 +126,30 @@ parser.add_option(
    help="Include empty feeds in the list, default strict is enabled",
 )
 parser.add_option(
    "-b",
    "--brute-force",
    action="store_true",
    default=False,
    help="Search RSS/Atom feeds by brute-forcing url path (useful if the page is missing a link entry)",
 )
 (options, args) = parser.parse_args()
 if not options.link:
-    print("URL missing")
+    print("Link/url missing - -l option")
    parser.print_help()
    sys.exit(0)
 if not options.brute_force:
    print(
-    json.dumps(findfeeds(options.link, disable_strict=options.disable_strict)).decode(
+        json.dumps(
-        "utf-8"
+            findfeeds(url=options.link, disable_strict=options.disable_strict)
        ).decode("utf-8")
    )
 else:
    print(
        json.dumps(
            brutefindfeeds(url=options.link, disable_strict=options.disable_strict)
        ).decode("utf-8")
    )