2024-03-03 20:49:55 +00:00
#!/usr/bin/python3
2024-03-04 10:14:28 +00:00
# [rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script designed to discover RSS or Atom feeds from a given URL.
#
# It employs two techniques:
#
# - The first involves searching for direct link references to the feed within the HTML page.
# - The second uses a brute-force approach, trying a series of known paths for feeds to determine if they are valid RSS or Atom feeds.
#
# The script returns an array in JSON format containing all the potential feeds it discovers.
2024-03-03 20:49:55 +00:00
import sys
import urllib . parse
from optparse import OptionParser
2024-03-04 10:14:28 +00:00
import random
2024-03-03 20:49:55 +00:00
import feedparser
import orjson as json
import requests
from bs4 import BeautifulSoup as bs4
2024-03-04 10:14:28 +00:00
brute_force_urls = [
" index.xml " ,
" feed/index.php " ,
" feed.xml " ,
" feed.atom " ,
" feed.rss " ,
" feed.json " ,
" feed.php " ,
" feed.asp " ,
" posts.rss " ,
" blog.xml " ,
" atom.xml " ,
" podcasts.xml " ,
" main.atom " ,
" main.xml " ,
]
random . shuffle ( brute_force_urls )
2024-03-03 20:49:55 +00:00
def findfeeds ( url = None , disable_strict = False ) :
if url is None :
return None
2024-03-03 21:07:18 +00:00
raw = requests . get ( url , headers = headers ) . text
2024-03-03 20:49:55 +00:00
results = [ ]
discovered_feeds = [ ]
html = bs4 ( raw , features = " lxml " )
feed_urls = html . findAll ( " link " , rel = " alternate " )
if feed_urls :
for f in feed_urls :
tag = f . get ( " type " , None )
if tag :
if " feed " in tag or " rss " in tag or " xml " in tag :
href = f . get ( " href " , None )
if href :
discovered_feeds . append ( href )
parsed_url = urllib . parse . urlparse ( url )
base = f " { parsed_url . scheme } :// { parsed_url . hostname } "
ahreftags = html . findAll ( " a " )
for a in ahreftags :
href = a . get ( " href " , None )
if href :
if " feed " in href or " rss " in href or " xml " in href :
discovered_feeds . append ( f " { base } { href } " )
for url in list ( set ( discovered_feeds ) ) :
f = feedparser . parse ( url )
if f . entries :
if url not in results :
results . append ( url )
if disable_strict :
return list ( set ( discovered_feeds ) )
else :
return results
2024-03-04 10:14:28 +00:00
def brutefindfeeds ( url = None , disable_strict = False ) :
if url is None :
return None
found_urls = [ ]
found_valid_feeds = [ ]
parsed_url = urllib . parse . urlparse ( url )
for path in brute_force_urls :
url = f " { parsed_url . scheme } :// { parsed_url . hostname } / { path } "
r = requests . get ( url , headers = headers )
if r . status_code == 200 :
found_urls . append ( url )
for url in list ( set ( found_urls ) ) :
f = feedparser . parse ( url )
if f . entries :
if url not in found_valid_feeds :
found_valid_feeds . append ( url )
if disable_strict :
return list ( set ( found_urls ) )
else :
return found_valid_feeds
2024-03-03 20:49:55 +00:00
version = " 0.2 "
2024-03-03 21:07:18 +00:00
user_agent = f " rssfind.py { version } +https://github.com/adulau/rss-tools "
feedparser . USER_AGENT = user_agent
headers = { " User-Agent " : user_agent }
2024-03-03 20:49:55 +00:00
usage = " Find RSS or Atom feeds from an URL \n usage: % prog [options] "
parser = OptionParser ( usage )
parser . add_option (
" -l " ,
" --link " ,
dest = " link " ,
help = " http link where to find one or more feed source(s) " ,
)
parser . add_option (
" -d " ,
" --disable-strict " ,
action = " store_false " ,
default = False ,
help = " Include empty feeds in the list, default strict is enabled " ,
)
2024-03-04 10:14:28 +00:00
parser . add_option (
" -b " ,
" --brute-force " ,
action = " store_true " ,
default = False ,
help = " Search RSS/Atom feeds by brute-forcing url path (useful if the page is missing a link entry) " ,
)
2024-03-03 20:49:55 +00:00
( options , args ) = parser . parse_args ( )
if not options . link :
2024-03-04 10:14:28 +00:00
print ( " Link/url missing - -l option " )
2024-03-03 20:49:55 +00:00
parser . print_help ( )
sys . exit ( 0 )
2024-03-04 10:14:28 +00:00
if not options . brute_force :
print (
json . dumps (
findfeeds ( url = options . link , disable_strict = options . disable_strict )
) . decode ( " utf-8 " )
)
else :
print (
json . dumps (
brutefindfeeds ( url = options . link , disable_strict = options . disable_strict )
) . decode ( " utf-8 " )
2024-03-03 20:49:55 +00:00
)