From 848b96a03c8b19668bf2cd78a269bd46256e240c Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy <a@foo.be>
Date: Sun, 3 Mar 2024 21:49:55 +0100
Subject: [PATCH 1/3] new: [rssfind.py] a simple script to discover RSS/Atom
 feeds from an URL

---
 README.md      | 15 +++++++++
 REQUIREMENTS   |  2 ++
 bin/rssfind.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 106 insertions(+)
 create mode 100644 bin/rssfind.py

diff --git a/README.md b/README.md
index 89c51d4..f0ffe3d 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,21 @@ As 2024 marks the resurgence of RSS and Atom[^1], I decided to update my rudimen
 
 ## Tools
 
+### rssfind
+
+[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script for discovering RSS or Atom feeds from a URL. It returns an array in JSON format of all the potential feeds discovered.
+
+~~~shell
+Usage: Find RSS or Atom feeds from an URL
+usage: rssfind.py [options]
+
+Options:
+  -h, --help            show this help message and exit
+  -l LINK, --link=LINK  http link where to find one or more feed source(s)
+  -d, --disable-strict  Include empty feeds in the list, default strict is
+                        enabled
+~~~
+
 ### rsscluster
 
 [rsscluster.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script that clusters items from an RSS feed based on a specified time interval, expressed in days.
diff --git a/REQUIREMENTS b/REQUIREMENTS
index f57478b..7aaddcd 100644
--- a/REQUIREMENTS
+++ b/REQUIREMENTS
@@ -1,2 +1,4 @@
 bs4
 feedparser
+orjson
+requests
diff --git a/bin/rssfind.py b/bin/rssfind.py
new file mode 100644
index 0000000..c25b6f6
--- /dev/null
+++ b/bin/rssfind.py
@@ -0,0 +1,89 @@
+#!/usr/bin/python3
+
+import sys
+import urllib.parse
+from optparse import OptionParser
+
+import feedparser
+import orjson as json
+import requests
+from bs4 import BeautifulSoup as bs4
+
+
+def findfeeds(url=None, disable_strict=False):
+    if url is None:
+        return None
+
+    raw = requests.get(url).text
+    results = []
+    discovered_feeds = []
+    html = bs4(raw, features="lxml")
+    # Collect candidate feed URLs. Pass 1: <link rel="alternate"> tags whose type
+    # looks like a feed; each href is resolved against the page URL so a relative
+    # href is never handed to feedparser.parse() as a bare path.
+    for link in html.findAll("link", rel="alternate"):
+        tag = link.get("type", None)
+        href = link.get("href", None)
+        if tag and href and ("feed" in tag or "rss" in tag or "xml" in tag):
+            discovered_feeds.append(urllib.parse.urljoin(url, href))
+
+    # Pass 2: every <a href> mentioning feed/rss/xml. urljoin() leaves absolute
+    # links untouched and anchors relative ones on the page URL, port included
+    # (gluing scheme://hostname in front of the href mangled both cases).
+    for a in html.findAll("a"):
+        href = a.get("href", None)
+        if href and ("feed" in href or "rss" in href or "xml" in href):
+            discovered_feeds.append(urllib.parse.urljoin(url, href))
+
+    # Deduplicate, then keep in `results` only the candidates that feedparser
+    # parses into at least one entry.
+    candidates = list(set(discovered_feeds))
+    for candidate in candidates:
+        f = feedparser.parse(candidate)
+        if f.entries:
+            results.append(candidate)
+
+    # disable_strict: hand back every candidate, validated or not.
+    if disable_strict:
+        return candidates
+    else:
+        return results
+
+
+version = "0.2"
+
+feedparser.USER_AGENT = (
+    "rssfind.py " + version + " +https://github.com/adulau/rss-tools"
+)
+
+usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]"
+
+parser = OptionParser(usage)
+
+parser.add_option(
+    "-l",
+    "--link",
+    dest="link",
+    help="http link where to find one or more feed source(s)",
+)
+
+parser.add_option(
+    "-d",
+    "--disable-strict",
+    action="store_true",  # passing -d sets options.disable_strict to True
+    default=False,
+    help="Include empty feeds in the list, default strict is enabled",
+)
+
+(options, args) = parser.parse_args()
+
+if not options.link:
+    print("URL missing")
+    parser.print_help()
+    sys.exit(0)
+
+print(
+    json.dumps(findfeeds(options.link, disable_strict=options.disable_strict)).decode(
+        "utf-8"
+    )
+)

From 149c6b4489d34bcfb53bea8e872066386ef823ee Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy <a@foo.be>
Date: Sun, 3 Mar 2024 22:07:18 +0100
Subject: [PATCH 2/3] chg: [rssfind] set coherent `User-Agent` headers

---
 bin/rssfind.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/bin/rssfind.py b/bin/rssfind.py
index c25b6f6..ce03b0b 100644
--- a/bin/rssfind.py
+++ b/bin/rssfind.py
@@ -14,7 +14,7 @@ def findfeeds(url=None, disable_strict=False):
     if url is None:
         return None
 
-    raw = requests.get(url).text
+    raw = requests.get(url, headers=headers).text
     results = []
     discovered_feeds = []
     html = bs4(raw, features="lxml")
@@ -52,9 +52,12 @@ def findfeeds(url=None, disable_strict=False):
 
 version = "0.2"
 
-feedparser.USER_AGENT = (
-    "rssfind.py " + version + " +https://github.com/adulau/rss-tools"
-)
+user_agent = f"rssfind.py {version} +https://github.com/adulau/rss-tools"
+
+feedparser.USER_AGENT = user_agent
+
+
+headers = {"User-Agent": user_agent}
 
 usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]"
 

From 4f263946929f43c36ad62af966a523a19e78bdc4 Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy <a@foo.be>
Date: Mon, 4 Mar 2024 11:14:28 +0100
Subject: [PATCH 3/3] chg: [rssfind] added a brute-force mode `-b` to discover
 potential feed source

---
 README.md      | 11 +++++++-
 bin/rssfind.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index f0ffe3d..f655a81 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,14 @@ As 2024 marks the resurgence of RSS and Atom[^1], I decided to update my rudimen
 
 ### rssfind
 
-[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script for discovering RSS or Atom feeds from a URL. It returns an array in JSON format of all the potential feeds discovered.
+[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script designed to discover RSS or Atom feeds from a given URL.
+
+It employs two techniques:
+
+- The first involves searching for direct link references to the feed within the HTML page.
+- The second uses a brute-force approach, trying a series of known paths for feeds to determine if they are valid RSS or Atom feeds.
+
+The script returns an array in JSON format containing all the potential feeds it discovers.
 
 ~~~shell
 Usage: Find RSS or Atom feeds from an URL
@@ -28,6 +35,8 @@ Options:
   -l LINK, --link=LINK  http link where to find one or more feed source(s)
   -d, --disable-strict  Include empty feeds in the list, default strict is
                         enabled
+  -b, --brute-force     Search RSS/Atom feeds by brute-forcing url path
+                        (useful if the page is missing a link entry)
 ~~~
 
 ### rsscluster
diff --git a/bin/rssfind.py b/bin/rssfind.py
index ce03b0b..8d528f1 100644
--- a/bin/rssfind.py
+++ b/bin/rssfind.py
@@ -1,14 +1,41 @@
 #!/usr/bin/python3
+# [rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script designed to discover RSS or Atom feeds from a given URL.
+#
+# It employs two techniques:
+#
+# - The first involves searching for direct link references to the feed within the HTML page.
+# - The second uses a brute-force approach, trying a series of known paths for feeds to determine if they are valid RSS or Atom feeds.
+#
+# The script returns an array in JSON format containing all the potential feeds it discovers.
 
 import sys
 import urllib.parse
 from optparse import OptionParser
+import random
 
 import feedparser
 import orjson as json
 import requests
 from bs4 import BeautifulSoup as bs4
 
+brute_force_urls = [
+    "index.xml",
+    "feed/index.php",
+    "feed.xml",
+    "feed.atom",
+    "feed.rss",
+    "feed.json",
+    "feed.php",
+    "feed.asp",
+    "posts.rss",
+    "blog.xml",
+    "atom.xml",
+    "podcasts.xml",
+    "main.atom",
+    "main.xml",
+]
+random.shuffle(brute_force_urls)
+
 
 def findfeeds(url=None, disable_strict=False):
     if url is None:
@@ -50,13 +77,34 @@ def findfeeds(url=None, disable_strict=False):
         return results
 
 
+def brutefindfeeds(url=None, disable_strict=False):
+    """Probe well-known feed paths on the site of *url* and return the hits.
+
+    Strict mode keeps URLs whose parsed feed has entries; otherwise any HTTP 200.
+    """
+    if url is None:
+        return None
+    found_urls = []
+    parsed_url = urllib.parse.urlparse(url)
+    for path in brute_force_urls:
+        # netloc rather than hostname: an explicit port must survive.
+        target = f"{parsed_url.scheme}://{parsed_url.netloc}/{path}"
+        try:  # a dead or hanging path must not abort the whole scan
+            if requests.get(target, headers=headers, timeout=10).status_code == 200:
+                found_urls.append(target)
+        except requests.RequestException:
+            continue
+    if disable_strict:
+        return found_urls
+    return [u for u in found_urls if feedparser.parse(u).entries]
+
+
 version = "0.2"
 
 user_agent = f"rssfind.py {version} +https://github.com/adulau/rss-tools"
 
 feedparser.USER_AGENT = user_agent
 
-
 headers = {"User-Agent": user_agent}
 
 usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]"
@@ -78,15 +126,30 @@ parser.add_option(
     help="Include empty feeds in the list, default strict is enabled",
 )
 
+parser.add_option(
+    "-b",
+    "--brute-force",
+    action="store_true",
+    default=False,
+    help="Search RSS/Atom feeds by brute-forcing url path (useful if the page is missing a link entry)",
+)
+
 (options, args) = parser.parse_args()
 
 if not options.link:
-    print("URL missing")
+    print("Link/url missing - -l option")
     parser.print_help()
     sys.exit(0)
 
-print(
-    json.dumps(findfeeds(options.link, disable_strict=options.disable_strict)).decode(
-        "utf-8"
+if not options.brute_force:
+    print(
+        json.dumps(
+            findfeeds(url=options.link, disable_strict=options.disable_strict)
+        ).decode("utf-8")
+    )
+else:
+    print(
+        json.dumps(
+            brutefindfeeds(url=options.link, disable_strict=options.disable_strict)
+        ).decode("utf-8")
     )
-)