mirror of
https://github.com/adulau/rss-tools.git
synced 2024-11-21 17:47:07 +00:00
new: [rssfind.py] a simple script to discover RSS/Atom feeds from an URL
This commit is contained in:
parent
243ac1d233
commit
848b96a03c
3 changed files with 106 additions and 0 deletions
15
README.md
15
README.md
|
@ -15,6 +15,21 @@ As 2024 marks the resurgence of RSS and Atom[^1], I decided to update my rudimen
|
||||||
|
|
||||||
## Tools
|
## Tools
|
||||||
|
|
||||||
|
### rssfind
|
||||||
|
|
||||||
|
[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script for discovering RSS or Atom feeds from a URL. It returns a JSON array of all the potential feeds discovered.
|
||||||
|
|
||||||
|
~~~shell
|
||||||
|
Usage: Find RSS or Atom feeds from an URL
|
||||||
|
usage: rssfind.py [options]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
-h, --help show this help message and exit
|
||||||
|
-l LINK, --link=LINK http link where to find one or more feed source(s)
|
||||||
|
-d, --disable-strict Include empty feeds in the list, default strict is
|
||||||
|
enabled
|
||||||
|
~~~
|
||||||
|
|
||||||
### rsscluster
|
### rsscluster
|
||||||
|
|
||||||
[rsscluster.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script that clusters items from an RSS feed based on a specified time interval, expressed in days.
|
[rsscluster.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script that clusters items from an RSS feed based on a specified time interval, expressed in days.
|
||||||
|
|
|
@ -1,2 +1,4 @@
|
||||||
bs4
|
bs4
|
||||||
feedparser
|
feedparser
|
||||||
|
orjson
|
||||||
|
requests
|
||||||
|
|
89
bin/rssfind.py
Normal file
89
bin/rssfind.py
Normal file
|
@ -0,0 +1,89 @@
|
||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import urllib.parse
|
||||||
|
from optparse import OptionParser
|
||||||
|
|
||||||
|
import feedparser
|
||||||
|
import orjson as json
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup as bs4
|
||||||
|
|
||||||
|
|
||||||
|
def findfeeds(url=None, disable_strict=False):
    """Discover RSS/Atom feed URLs advertised by the page at *url*.

    Candidates are collected from two places in the fetched HTML:

    * ``<link rel="alternate">`` elements whose ``type`` attribute contains
      "feed", "rss" or "xml";
    * ``<a>`` elements whose ``href`` contains one of those substrings.

    Every candidate is resolved against *url*, so relative, root-relative
    and absolute hrefs all end up as absolute URLs.

    Args:
        url: Address of the page to inspect.
        disable_strict: When true, return every candidate without checking
            it. When false (the default), each candidate is parsed with
            feedparser and kept only if it yields at least one entry.

    Returns:
        ``None`` when *url* is ``None``; otherwise a list of feed URLs
        without duplicates, in order of discovery.
    """
    if url is None:
        return None

    # A timeout keeps an unresponsive server from hanging the tool forever.
    raw = requests.get(url, timeout=30).text
    html = bs4(raw, features="lxml")
    markers = ("feed", "rss", "xml")

    candidates = []
    for link in html.find_all("link", rel="alternate"):
        mime = link.get("type")
        href = link.get("href")
        if mime and href and any(marker in mime for marker in markers):
            candidates.append(urllib.parse.urljoin(url, href))

    for anchor in html.find_all("a"):
        href = anchor.get("href")
        if href and any(marker in href for marker in markers):
            # urljoin leaves absolute hrefs untouched and keeps the port of
            # the page URL, unlike naive "scheme://hostname" + href gluing.
            candidates.append(urllib.parse.urljoin(url, href))

    # dict.fromkeys removes duplicates while preserving discovery order.
    discovered = list(dict.fromkeys(candidates))

    if disable_strict:
        # Nothing is validated in this mode, so skip fetching the feeds.
        return discovered

    return [feed for feed in discovered if feedparser.parse(feed).entries]
|
||||||
|
|
||||||
|
|
||||||
|
version = "0.2"

# Identify this tool in the User-Agent feedparser sends when it fetches
# candidate feeds.
feedparser.USER_AGENT = (
    "rssfind.py " + version + " +https://github.com/adulau/rss-tools"
)

usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]"

parser = OptionParser(usage)

parser.add_option(
    "-l",
    "--link",
    dest="link",
    help="http link where to find one or more feed source(s)",
)

parser.add_option(
    "-d",
    "--disable-strict",
    dest="disable_strict",
    # Must be store_true: with store_false and default=False the flag could
    # never change the value, so the option silently did nothing.
    action="store_true",
    default=False,
    help="Include empty feeds in the list, default strict is enabled",
)


def main():
    """Parse the command line and print the discovered feeds as JSON.

    Prints "URL missing" followed by the help text and exits when no
    --link option is given.
    """
    (options, _args) = parser.parse_args()

    if not options.link:
        print("URL missing")
        parser.print_help()
        sys.exit(0)

    feeds = findfeeds(options.link, disable_strict=options.disable_strict)
    # The json name is bound to orjson, whose dumps() returns bytes.
    print(json.dumps(feeds).decode("utf-8"))


# Guarded so that importing this module does not parse sys.argv or hit the
# network.
if __name__ == "__main__":
    main()
|
Loading…
Reference in a new issue