mirror of
https://github.com/adulau/rss-tools.git
synced 2024-11-21 17:47:07 +00:00
chg: [rssfind] added a brute-force mode -b
to discover potential feed source
This commit is contained in:
parent
149c6b4489
commit
4f26394692
2 changed files with 79 additions and 7 deletions
11
README.md
11
README.md
|
@ -17,7 +17,14 @@ As 2024 marks the resurgence of RSS and Atom[^1], I decided to update my rudimen
|
||||||
|
|
||||||
### rssfind
|
### rssfind
|
||||||
|
|
||||||
[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script for discovering RSS or Atom feeds from a URL. It returns an array in JSON format of all the potential feeds discovered.
|
[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script designed to discover RSS or Atom feeds from a given URL.
|
||||||
|
|
||||||
|
It employs two techniques:
|
||||||
|
|
||||||
|
- The first involves searching for direct link references to the feed within the HTML page.
|
||||||
|
- The second uses a brute-force approach, trying a series of known paths for feeds to determine if they are valid RSS or Atom feeds.
|
||||||
|
|
||||||
|
The script returns an array in JSON format containing all the potential feeds it discovers.
|
||||||
|
|
||||||
~~~shell
|
~~~shell
|
||||||
Usage: Find RSS or Atom feeds from an URL
|
Usage: Find RSS or Atom feeds from an URL
|
||||||
|
@ -28,6 +35,8 @@ Options:
|
||||||
-l LINK, --link=LINK http link where to find one or more feed source(s)
|
-l LINK, --link=LINK http link where to find one or more feed source(s)
|
||||||
-d, --disable-strict Include empty feeds in the list, default strict is
|
-d, --disable-strict Include empty feeds in the list, default strict is
|
||||||
enabled
|
enabled
|
||||||
|
-b, --brute-force Search RSS/Atom feeds by brute-forcing url path
|
||||||
|
(useful if the page is missing a link entry)
|
||||||
~~~
|
~~~
|
||||||
|
|
||||||
### rsscluster
|
### rsscluster
|
||||||
|
|
|
@ -1,14 +1,41 @@
|
||||||
#!/usr/bin/python3
|
#!/usr/bin/python3
|
||||||
|
# [rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script designed to discover RSS or Atom feeds from a given URL.
|
||||||
|
#
|
||||||
|
# It employs two techniques:
|
||||||
|
#
|
||||||
|
# - The first involves searching for direct link references to the feed within the HTML page.
|
||||||
|
# - The second uses a brute-force approach, trying a series of known paths for feeds to determine if they are valid RSS or Atom feeds.
|
||||||
|
#
|
||||||
|
# The script returns an array in JSON format containing all the potential feeds it discovers.
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
from optparse import OptionParser
|
from optparse import OptionParser
|
||||||
|
import random
|
||||||
|
|
||||||
import feedparser
|
import feedparser
|
||||||
import orjson as json
|
import orjson as json
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup as bs4
|
from bs4 import BeautifulSoup as bs4
|
||||||
|
|
||||||
|
# Well-known feed file names. brutefindfeeds() appends each one to the root
# of the target site ("<scheme>://<host>/<path>") and checks whether it exists.
brute_force_urls = [
    "index.xml", "feed/index.php",
    "feed.xml", "feed.atom", "feed.rss", "feed.json", "feed.php", "feed.asp",
    "posts.rss", "blog.xml", "atom.xml", "podcasts.xml",
    "main.atom", "main.xml",
]
# Shuffled in place once, when the module is loaded, so the paths are not
# probed in the same fixed order on every run.
random.shuffle(brute_force_urls)
|
||||||
|
|
||||||
|
|
||||||
def findfeeds(url=None, disable_strict=False):
|
def findfeeds(url=None, disable_strict=False):
|
||||||
if url is None:
|
if url is None:
|
||||||
|
@ -50,13 +77,34 @@ def findfeeds(url=None, disable_strict=False):
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def brutefindfeeds(url=None, disable_strict=False, timeout=10):
    """Discover feeds by probing well-known feed paths on the site of *url*.

    Each entry of the module-level ``brute_force_urls`` list is requested at
    the root of the site (scheme and network location of *url*; the path of
    *url* itself is ignored).

    Args:
        url: Any absolute URL on the site to probe.
        disable_strict: When true, return every candidate that answered with
            HTTP 200. Otherwise return only the candidates in which
            feedparser finds at least one entry.
        timeout: Seconds to wait for each probe request before giving up on
            that candidate.

    Returns:
        ``None`` when *url* is ``None``; otherwise a list of feed URLs, in
        probing order. The list is empty when *url* has no scheme or host.
    """
    if url is None:
        return None

    parsed_url = urllib.parse.urlparse(url)
    if not parsed_url.scheme or not parsed_url.netloc:
        # Without a scheme and a host there is nothing that can be probed.
        return []

    # netloc (not hostname) so an explicit port or IPv6 literal is preserved.
    base = f"{parsed_url.scheme}://{parsed_url.netloc}"

    found_urls = []
    for path in brute_force_urls:
        candidate = f"{base}/{path}"
        try:
            response = requests.get(candidate, headers=headers, timeout=timeout)
        except requests.RequestException:
            # One unreachable or misbehaving path must not abort the scan.
            continue
        if response.status_code == 200 and candidate not in found_urls:
            found_urls.append(candidate)

    if disable_strict:
        return found_urls

    # Strict mode: keep only candidates that parse as a feed with entries.
    return [
        candidate
        for candidate in found_urls
        if feedparser.parse(candidate).entries
    ]
|
||||||
|
|
||||||
|
|
||||||
version = "0.2"
|
version = "0.2"
|
||||||
|
|
||||||
user_agent = f"rssfind.py {version} +https://github.com/adulau/rss-tools"
|
user_agent = f"rssfind.py {version} +https://github.com/adulau/rss-tools"
|
||||||
|
|
||||||
feedparser.USER_AGENT = user_agent
|
feedparser.USER_AGENT = user_agent
|
||||||
|
|
||||||
|
|
||||||
headers = {"User-Agent": user_agent}
|
headers = {"User-Agent": user_agent}
|
||||||
|
|
||||||
usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]"
|
usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]"
|
||||||
|
@ -78,15 +126,30 @@ parser.add_option(
|
||||||
help="Include empty feeds in the list, default strict is enabled",
|
help="Include empty feeds in the list, default strict is enabled",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# -b / --brute-force: boolean flag, off by default, stored by optparse as
# options.brute_force.
parser.add_option(
    "-b",
    "--brute-force",
    default=False,
    action="store_true",
    help=(
        "Search RSS/Atom feeds by brute-forcing url path "
        "(useful if the page is missing a link entry)"
    ),
)
|
||||||
|
|
||||||
(options, args) = parser.parse_args()
|
(options, args) = parser.parse_args()
|
||||||
|
|
||||||
if not options.link:
|
if not options.link:
|
||||||
print("URL missing")
|
print("Link/url missing - -l option")
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
|
if not options.brute_force:
|
||||||
print(
|
print(
|
||||||
json.dumps(findfeeds(options.link, disable_strict=options.disable_strict)).decode(
|
json.dumps(
|
||||||
"utf-8"
|
findfeeds(url=options.link, disable_strict=options.disable_strict)
|
||||||
|
).decode("utf-8")
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
print(
|
||||||
|
json.dumps(
|
||||||
|
brutefindfeeds(url=options.link, disable_strict=options.disable_strict)
|
||||||
|
).decode("utf-8")
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in a new issue