new: [domainclassifier] add a simple cache of the TLDs list from IANA

(to avoid downloading the list each time the library starts)
This commit is contained in:
Alexandre Dulaunoy 2022-07-30 16:14:21 +02:00
parent 8debd6c6b7
commit 1e55e0a5a7
Signed by: adulau
GPG key ID: 09E2CD4944E6CBCD

View file

@@ -10,6 +10,7 @@ import IPy
import socket import socket
import time import time
from datetime import date, timedelta from datetime import date, timedelta
import os
try: try:
# python 3 # python 3
@@ -82,14 +83,28 @@ class Extract:
) )
return value['response']['ranking']['rank'] return value['response']['ranking']['rank']
def __updatelisttld(self): def __updatelisttld(self, force=False):
ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt" ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
req = urllib.Request(ianatldlist) userdir = os.path.expanduser("~")
req.add_header( cachedir = os.path.join(userdir, ".DomainClassifier")
'User-Agent', if not os.path.exists(cachedir):
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0', os.mkdir(cachedir)
) tldcache = os.path.join(cachedir, "tlds")
tlds = (urllib.urlopen(req).read()).decode('utf8') if not os.path.exists(tldcache):
print(tldcache)
req = urllib.Request(ianatldlist)
req.add_header(
'User-Agent',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0',
)
tlds = (urllib.urlopen(req).read()).decode('utf8')
f = open(tldcache, "wb")
f.write(tlds.encode("utf-8"))
f.close()
f = open(tldcache, "r")
tlds = f.read()
f.close()
tlds = tlds.split("\n") tlds = tlds.split("\n")
for tld in tlds: for tld in tlds:
self.listtld.append(tld.lower()) self.listtld.append(tld.lower())