From 4f263946929f43c36ad62af966a523a19e78bdc4 Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Mon, 4 Mar 2024 11:14:28 +0100 Subject: [PATCH] chg: [rssfind] added a brute-force mode `-b` to discover potential feed source --- README.md | 11 +++++++- bin/rssfind.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 79 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index f0ffe3d..f655a81 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,14 @@ As 2024 marks the resurgence of RSS and Atom[^1], I decided to update my rudimen ### rssfind -[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script for discovering RSS or Atom feeds from a URL. It returns an array in JSON format of all the potential feeds discovered. +[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script designed to discover RSS or Atom feeds from a given URL. + +It employs two techniques: + +- The first involves searching for direct link references to the feed within the HTML page. +- The second uses a brute-force approach, trying a series of known paths for feeds to determine if they are valid RSS or Atom feeds. + +The script returns an array in JSON format containing all the potential feeds it discovers. ~~~shell Usage: Find RSS or Atom feeds from an URL @@ -28,6 +35,8 @@ Options: -l LINK, --link=LINK http link where to find one or more feed source(s) -d, --disable-strict Include empty feeds in the list, default strict is enabled + -b, --brute-force Search RSS/Atom feeds by brute-forcing url path + (useful if the page is missing a link entry) ~~~ ### rsscluster diff --git a/bin/rssfind.py b/bin/rssfind.py index ce03b0b..8d528f1 100644 --- a/bin/rssfind.py +++ b/bin/rssfind.py @@ -1,14 +1,41 @@ #!/usr/bin/python3 +# [rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script designed to discover RSS or Atom feeds from a given URL. 
+# +# It employs two techniques: +# +# - The first involves searching for direct link references to the feed within the HTML page. +# - The second uses a brute-force approach, trying a series of known paths for feeds to determine if they are valid RSS or Atom feeds. +# +# The script returns an array in JSON format containing all the potential feeds it discovers. import sys import urllib.parse from optparse import OptionParser +import random import feedparser import orjson as json import requests from bs4 import BeautifulSoup as bs4 +brute_force_urls = [ + "index.xml", + "feed/index.php", + "feed.xml", + "feed.atom", + "feed.rss", + "feed.json", + "feed.php", + "feed.asp", + "posts.rss", + "blog.xml", + "atom.xml", + "podcasts.xml", + "main.atom", + "main.xml", +] +random.shuffle(brute_force_urls) + def findfeeds(url=None, disable_strict=False): if url is None: @@ -50,13 +77,34 @@ def findfeeds(url=None, disable_strict=False): return results +def brutefindfeeds(url=None, disable_strict=False): + if url is None: + return None + found_urls = [] + found_valid_feeds = [] + parsed_url = urllib.parse.urlparse(url) + for path in brute_force_urls: + url = f"{parsed_url.scheme}://{parsed_url.hostname}/{path}" + r = requests.get(url, headers=headers) + if r.status_code == 200: + found_urls.append(url) + for url in list(set(found_urls)): + f = feedparser.parse(url) + if f.entries: + if url not in found_valid_feeds: + found_valid_feeds.append(url) + if disable_strict: + return list(set(found_urls)) + else: + return found_valid_feeds + + version = "0.2" user_agent = f"rssfind.py {version} +https://github.com/adulau/rss-tools" feedparser.USER_AGENT = user_agent - headers = {"User-Agent": user_agent} usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]" @@ -78,15 +126,30 @@ parser.add_option( help="Include empty feeds in the list, default strict is enabled", ) +parser.add_option( + "-b", + "--brute-force", + action="store_true", + default=False, + help="Search 
RSS/Atom feeds by brute-forcing url path (useful if the page is missing a link entry)", +) + (options, args) = parser.parse_args() if not options.link: - print("URL missing") + print("Link/url missing - -l option") parser.print_help() sys.exit(0) -print( - json.dumps(findfeeds(options.link, disable_strict=options.disable_strict)).decode( - "utf-8" +if not options.brute_force: + print( + json.dumps( + findfeeds(url=options.link, disable_strict=options.disable_strict) + ).decode("utf-8") + ) +else: + print( + json.dumps( + brutefindfeeds(url=options.link, disable_strict=options.disable_strict) + ).decode("utf-8") ) -)