Mirror of https://github.com/adulau/rss-tools.git, synced 2024-12-21 16:05:57 +00:00

Commit 779f49d143: Merge branch 'master' of github.com:adulau/rss-tools

3 changed files with 181 additions and 0 deletions

README.md (24 additions)
@@ -15,6 +15,30 @@ As 2024 marks the resurgence of RSS and Atom[^1], I decided to update my rudimen

## Tools

### rssfind

[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script designed to discover RSS or Atom feeds from a given URL.

It employs two techniques:

- The first involves searching for direct link references to the feed within the HTML page.
- The second uses a brute-force approach, trying a series of known feed paths to determine whether they are valid RSS or Atom feeds.

The script returns an array in JSON format containing all the potential feeds it discovers (see the example run below).

~~~shell
Usage: Find RSS or Atom feeds from a URL
usage: rssfind.py [options]

Options:
  -h, --help            show this help message and exit
  -l LINK, --link=LINK  http link where to find one or more feed source(s)
  -d, --disable-strict  Include empty feeds in the list, default strict is
                        enabled
  -b, --brute-force     Search RSS/Atom feeds by brute-forcing url path
                        (useful if the page is missing a link entry)
~~~
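For illustration, a hypothetical run against a blog homepage (the domain and output below are invented for this example, not taken from the repository) prints a compact JSON array of the discovered feed URLs:

~~~shell
$ python3 bin/rssfind.py -l https://www.example.com
["https://www.example.com/feed.xml","https://www.example.com/comments/feed.xml"]
~~~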
### rsscluster

[rsscluster.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script that clusters items from an RSS feed based on a specified time interval, expressed in days.
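To make the clustering idea concrete, here is a minimal sketch of grouping feed items by a time interval expressed in days. This is an editor's illustration under stated assumptions, not rsscluster.py's actual implementation; it assumes entries expose published_parsed and starts a new cluster whenever the gap between consecutive items exceeds the interval:

~~~python
import feedparser
from datetime import datetime, timedelta


def cluster_by_interval(feed_url, interval_days=5):
    """Group feed entries so that consecutive items published within
    `interval_days` of each other end up in the same cluster (sketch)."""
    feed = feedparser.parse(feed_url)
    # Keep only entries with a parsed publication date, oldest first.
    entries = [e for e in feed.entries if e.get("published_parsed")]
    entries.sort(key=lambda e: e.published_parsed)

    clusters = []
    previous = None
    for entry in entries:
        published = datetime(*entry.published_parsed[:6])
        if previous is None or published - previous > timedelta(days=interval_days):
            clusters.append([])  # gap larger than the interval: start a new cluster
        clusters[-1].append(entry.get("title", ""))
        previous = published
    return clusters
~~~

With interval_days=5, a burst of posts published within a few days of each other lands in one cluster, and a longer quiet period starts the next one.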
Requirements file (2 additions)

@@ -2,3 +2,5 @@ arrow
bs4
feedparser
feedgen
orjson
requests

bin/rssfind.py (155 additions, new file)

@@ -0,0 +1,155 @@
#!/usr/bin/python3
# [rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script designed to discover RSS or Atom feeds from a given URL.
#
# It employs two techniques:
#
# - The first involves searching for direct link references to the feed within the HTML page.
# - The second uses a brute-force approach, trying a series of known feed paths to determine whether they are valid RSS or Atom feeds.
#
# The script returns an array in JSON format containing all the potential feeds it discovers.

import sys
import urllib.parse
from optparse import OptionParser
import random

import feedparser
import orjson as json
import requests
from bs4 import BeautifulSoup as bs4

# Well-known feed paths probed in brute-force mode.
brute_force_urls = [
    "index.xml",
    "feed/index.php",
    "feed.xml",
    "feed.atom",
    "feed.rss",
    "feed.json",
    "feed.php",
    "feed.asp",
    "posts.rss",
    "blog.xml",
    "atom.xml",
    "podcasts.xml",
    "main.atom",
    "main.xml",
]
random.shuffle(brute_force_urls)

def findfeeds(url=None, disable_strict=False):
    if url is None:
        return None

    raw = requests.get(url, headers=headers).text
    results = []
    discovered_feeds = []
    html = bs4(raw, features="lxml")
    # First technique: direct <link rel="alternate"> references in the page.
    feed_urls = html.findAll("link", rel="alternate")
    if feed_urls:
        for f in feed_urls:
            tag = f.get("type", None)
            if tag:
                if "feed" in tag or "rss" in tag or "xml" in tag:
                    href = f.get("href", None)
                    if href:
                        discovered_feeds.append(href)

    # Also collect <a href="..."> links that look like feeds, prefixed with the page's scheme and hostname.
    parsed_url = urllib.parse.urlparse(url)
    base = f"{parsed_url.scheme}://{parsed_url.hostname}"
    ahreftags = html.findAll("a")

    for a in ahreftags:
        href = a.get("href", None)
        if href:
            if "feed" in href or "rss" in href or "xml" in href:
                discovered_feeds.append(f"{base}{href}")

    # Strict mode: keep only candidates that feedparser can parse into at least one entry.
    for url in list(set(discovered_feeds)):
        f = feedparser.parse(url)
        if f.entries:
            if url not in results:
                results.append(url)

    if disable_strict:
        return list(set(discovered_feeds))
    else:
        return results


def brutefindfeeds(url=None, disable_strict=False):
    if url is None:
        return None
    found_urls = []
    found_valid_feeds = []
    parsed_url = urllib.parse.urlparse(url)
    # Second technique: probe well-known feed paths on the same host.
    for path in brute_force_urls:
        url = f"{parsed_url.scheme}://{parsed_url.hostname}/{path}"
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            found_urls.append(url)
    # As above, strict mode keeps only candidates with at least one parsable entry.
    for url in list(set(found_urls)):
        f = feedparser.parse(url)
        if f.entries:
            if url not in found_valid_feeds:
                found_valid_feeds.append(url)
    if disable_strict:
        return list(set(found_urls))
    else:
        return found_valid_feeds

version = "0.2"

user_agent = f"rssfind.py {version} +https://github.com/adulau/rss-tools"

feedparser.USER_AGENT = user_agent

headers = {"User-Agent": user_agent}

usage = "Find RSS or Atom feeds from a URL\nusage: %prog [options]"

parser = OptionParser(usage)

parser.add_option(
    "-l",
    "--link",
    dest="link",
    help="http link where to find one or more feed source(s)",
)

parser.add_option(
    "-d",
    "--disable-strict",
    action="store_true",
    default=False,
    help="Include empty feeds in the list, default strict is enabled",
)

parser.add_option(
    "-b",
    "--brute-force",
    action="store_true",
    default=False,
    help="Search RSS/Atom feeds by brute-forcing url path (useful if the page is missing a link entry)",
)

(options, args) = parser.parse_args()

if not options.link:
    print("Link/url missing - -l option")
    parser.print_help()
    sys.exit(0)

if not options.brute_force:
    print(
        json.dumps(
            findfeeds(url=options.link, disable_strict=options.disable_strict)
        ).decode("utf-8")
    )
else:
    print(
        json.dumps(
            brutefindfeeds(url=options.link, disable_strict=options.disable_strict)
        ).decode("utf-8")
    )
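For completeness, a hypothetical brute-force run of the new script (the domain and output are invented for illustration); because the script prints a plain JSON array, its output pipes cleanly into tools such as jq:

~~~shell
$ python3 bin/rssfind.py -l https://www.example.com -b
["https://www.example.com/atom.xml"]

$ python3 bin/rssfind.py -l https://www.example.com -b | jq -r '.[]'
https://www.example.com/atom.xml
~~~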