From 115c48f65c69b0cab29cd5c879c201fe4fe9ad6c Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy
Date: Sat, 30 Jul 2022 15:44:23 +0200
Subject: [PATCH 1/5] fix: [DomainClassifier] set optional dns port
---
DomainClassifier/domainclassifier.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/DomainClassifier/domainclassifier.py b/DomainClassifier/domainclassifier.py
index 1ca5e8f..7d5a5ba 100644
--- a/DomainClassifier/domainclassifier.py
+++ b/DomainClassifier/domainclassifier.py
@@ -38,7 +38,7 @@ class Extract:
self.rawtext = rawtext
self.presolver = dns.resolver.Resolver()
self.presolver.nameservers = nameservers
- self.presolver.port = 53
+ self.presolver.port = port
self.presolver.lifetime = 1.0
self.bgprankingserver = 'pdns.circl.lu'
self.vdomain = []
From 8debd6c6b79211db3945895e768055c56ca3fdc3 Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy
Date: Sat, 30 Jul 2022 15:51:06 +0200
Subject: [PATCH 2/5] chg: [domainclassifier] clean-up code
---
DomainClassifier/domainclassifier.py | 105 ++++++++++++++++++---------
1 file changed, 69 insertions(+), 36 deletions(-)
diff --git a/DomainClassifier/domainclassifier.py b/DomainClassifier/domainclassifier.py
index 7d5a5ba..e0a603e 100644
--- a/DomainClassifier/domainclassifier.py
+++ b/DomainClassifier/domainclassifier.py
@@ -12,16 +12,16 @@ import time
from datetime import date, timedelta
try:
- #python 3
+ # python 3
import urllib.request as urllib
except:
- #python 2
+ # python 2
import urllib2 as urllib
try:
- from pybgpranking import BGPRanking
+ from pybgpranking import BGPRanking
except:
- print ("pybgpranking is not installed - ranking of ASN values won't be possible")
+ print("pybgpranking is not installed - ranking of ASN values won't be possible")
__author__ = "Alexandre Dulaunoy"
__copyright__ = "Copyright 2012-2021, Alexandre Dulaunoy"
__license__ = "AGPL version 3"
@@ -34,7 +34,7 @@ class Extract:
from a rawtext stream. When call, the rawtext parameter is a string
containing the raw data to be process."""
- def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port= 53):
+ def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port=53):
self.rawtext = rawtext
self.presolver = dns.resolver.Resolver()
self.presolver.nameservers = nameservers
@@ -52,7 +52,11 @@ class Extract:
def __origin(self, ipaddr=None):
if ipaddr:
- clook = IPy.IP(str(ipaddr)).reverseName().replace('.in-addr.arpa.', '.origin.asn.cymru.com')
+ clook = (
+ IPy.IP(str(ipaddr))
+ .reverseName()
+ .replace('.in-addr.arpa.', '.origin.asn.cymru.com')
+ )
try:
a = self.presolver.query(clook, 'TXT')
except dns.resolver.NXDOMAIN:
@@ -62,23 +66,30 @@ class Extract:
if a:
x = str(a[0]).split("|")
# why so many spaces?
- x = list( map(lambda t: t.replace("\"", "").strip(), x) )
+ x = list(map(lambda t: t.replace("\"", "").strip(), x))
return (x[0], x[2], a[0])
else:
return None
+
"""__bgpanking return the ranking the float value of an ASN.
"""
+
def __bgpranking(self, asn=None):
if asn:
bgpranking = BGPRanking()
- value = bgpranking.query(asn, date=(date.today() - timedelta(1)).isoformat())
+ value = bgpranking.query(
+ asn, date=(date.today() - timedelta(1)).isoformat()
+ )
return value['response']['ranking']['rank']
def __updatelisttld(self):
ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
req = urllib.Request(ianatldlist)
- req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0')
- tlds = ( urllib.urlopen(req).read() ).decode('utf8')
+ req.add_header(
+ 'User-Agent',
+ 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0',
+ )
+ tlds = (urllib.urlopen(req).read()).decode('utf8')
tlds = tlds.split("\n")
for tld in tlds:
self.listtld.append(tld.lower())
@@ -104,10 +115,12 @@ class Extract:
self.vdomain = []
return True
return False
+
"""potentialdomain method extracts potential domains matching any
string that is a serie of string with maximun 63 character separated by a
dot. The method used the rawtext defined at the instantiation of the class.
This return a list of a potential domain."""
+
def potentialdomain(self, validTLD=True):
self.domain = []
domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b')
@@ -124,7 +137,12 @@ class Extract:
returns a list of existing domain. If the extended flag is true, a set is
return with the associated DNS resources found."""
- def validdomain(self, rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'], extended=True, passive_dns=False):
+ def validdomain(
+ self,
+ rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'],
+ extended=True,
+ passive_dns=False,
+ ):
if extended is False:
self.vdomain = set()
else:
@@ -143,7 +161,17 @@ class Extract:
rrset = answers.rrset.to_text().splitlines()
for dns_resp in rrset:
dns_resp = dns_resp.split()
- passive_dns_out = '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format(time.time(), self.presolver.nameservers[0], dns_resp[2], domain, dnstype, dns_resp[4], answers.ttl)
+ passive_dns_out = (
+ '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format(
+ time.time(),
+ self.presolver.nameservers[0],
+ dns_resp[2],
+ domain,
+ dnstype,
+ dns_resp[4],
+ answers.ttl,
+ )
+ )
self.vdomain.add((passive_dns_out))
elif extended:
self.vdomain.append((domain, dnstype, answers[0]))
@@ -188,7 +216,7 @@ class Extract:
orig = self.__origin(ipaddr=dom[2])[1]
except:
continue
- if(orig == cc):
+ if orig == cc:
self.localdom.append(dom)
elif dom[1] == 'CNAME':
cname = str(dom[2])
@@ -197,7 +225,7 @@ class Extract:
orig = self.__origin(ipaddr=ip)[1]
except:
continue
- if(orig == cc):
+ if orig == cc:
self.localdom.append(dom)
return self.localdom
@@ -276,32 +304,37 @@ class Extract:
if type(dom) == tuple:
dom = dom[0]
if includefilter.search(dom):
- self.cleandomain.append(dom)
+ self.cleandomain.append(dom)
return set(self.cleandomain)
+
if __name__ == "__main__":
- c = Extract(rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist")
- c.text(rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be")
- print (c.potentialdomain())
- print (c.potentialdomain(validTLD=True))
- print (c.validdomain(extended=True))
- print ("US:")
- print (c.localizedomain(cc='US'))
- print ("LU:")
- print (c.localizedomain(cc='LU'))
- print ("BE:")
- print (c.localizedomain(cc='BE'))
- print ("Ranking:")
- print (c.rankdomain())
- print ("List of ip addresses:")
- print (c.ipaddress(extended=False))
- print ("Include dot.lu:")
- print (c.include(expression=r'\.lu$'))
- print ("Exclude dot.lu:")
- print (c.exclude(expression=r'\.lu$'))
+ c = Extract(
+ rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist"
+ )
+ c.text(
+ rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be"
+ )
+ print(c.potentialdomain())
+ print(c.potentialdomain(validTLD=True))
+ print(c.validdomain(extended=True))
+ print("US:")
+ print(c.localizedomain(cc='US'))
+ print("LU:")
+ print(c.localizedomain(cc='LU'))
+ print("BE:")
+ print(c.localizedomain(cc='BE'))
+ print("Ranking:")
+ print(c.rankdomain())
+ print("List of ip addresses:")
+ print(c.ipaddress(extended=False))
+ print("Include dot.lu:")
+ print(c.include(expression=r'\.lu$'))
+ print("Exclude dot.lu:")
+ print(c.exclude(expression=r'\.lu$'))
c.text(rawtext="www.lwn.net www.undeadly.org")
- print (c.potentialdomain(validTLD=True))
+ print(c.potentialdomain(validTLD=True))
c.validdomain()
- print (c.localizedomain(cc='US'))
+ print(c.localizedomain(cc='US'))
print(c.validdomain(extended=False, passive_dns=True))
From 1e55e0a5a7a573c0da4ca565695b1507eb2cd464 Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy
Date: Sat, 30 Jul 2022 16:14:21 +0200
Subject: [PATCH 3/5] new: [domainclassifier] add a simple cache of the IANA
 TLD list (to avoid downloading it every time the library starts)
---
DomainClassifier/domainclassifier.py | 29 +++++++++++++++++++++-------
1 file changed, 22 insertions(+), 7 deletions(-)
diff --git a/DomainClassifier/domainclassifier.py b/DomainClassifier/domainclassifier.py
index e0a603e..40ff813 100644
--- a/DomainClassifier/domainclassifier.py
+++ b/DomainClassifier/domainclassifier.py
@@ -10,6 +10,7 @@ import IPy
import socket
import time
from datetime import date, timedelta
+import os
try:
# python 3
@@ -82,14 +83,28 @@ class Extract:
)
return value['response']['ranking']['rank']
- def __updatelisttld(self):
+ def __updatelisttld(self, force=False):
ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
- req = urllib.Request(ianatldlist)
- req.add_header(
- 'User-Agent',
- 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0',
- )
- tlds = (urllib.urlopen(req).read()).decode('utf8')
+ userdir = os.path.expanduser("~")
+ cachedir = os.path.join(userdir, ".DomainClassifier")
+ if not os.path.exists(cachedir):
+ os.mkdir(cachedir)
+ tldcache = os.path.join(cachedir, "tlds")
+ if not os.path.exists(tldcache):
+ print(tldcache)
+ req = urllib.Request(ianatldlist)
+ req.add_header(
+ 'User-Agent',
+ 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0',
+ )
+ tlds = (urllib.urlopen(req).read()).decode('utf8')
+ f = open(tldcache, "wb")
+ f.write(tlds.encode("utf-8"))
+ f.close()
+
+ f = open(tldcache, "r")
+ tlds = f.read()
+ f.close()
tlds = tlds.split("\n")
for tld in tlds:
self.listtld.append(tld.lower())
From c769bba999334a6e66f7ee1f88aa8513aac55db7 Mon Sep 17 00:00:00 2001
From: terrtia
Date: Tue, 9 Jan 2024 15:02:22 +0100
Subject: [PATCH 4/5] chg: [domainclassifier] update req user-agent
---
DomainClassifier/domainclassifier.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/DomainClassifier/domainclassifier.py b/DomainClassifier/domainclassifier.py
index 40ff813..e072e5c 100644
--- a/DomainClassifier/domainclassifier.py
+++ b/DomainClassifier/domainclassifier.py
@@ -95,7 +95,7 @@ class Extract:
req = urllib.Request(ianatldlist)
req.add_header(
'User-Agent',
- 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0',
+ 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0',
)
tlds = (urllib.urlopen(req).read()).decode('utf8')
f = open(tldcache, "wb")
From 75a546bc3e39aeb58f8eba6981a15c5e552114ac Mon Sep 17 00:00:00 2001
From: terrtia
Date: Wed, 10 Jan 2024 14:28:00 +0100
Subject: [PATCH 5/5] chg: [domainclassifier] add dns records redis cache + regex
timeout
---
DomainClassifier/__init__.py | 2 +-
DomainClassifier/domainclassifier.py | 136 ++++++++++++++++++++-------
requirements.txt | 1 +
setup.py | 2 +-
4 files changed, 103 insertions(+), 38 deletions(-)
diff --git a/DomainClassifier/__init__.py b/DomainClassifier/__init__.py
index 4802e90..f901408 100644
--- a/DomainClassifier/__init__.py
+++ b/DomainClassifier/__init__.py
@@ -1 +1 @@
-__version__ = "1.0"
+__version__ = "1.1"
diff --git a/DomainClassifier/domainclassifier.py b/DomainClassifier/domainclassifier.py
index e072e5c..92c7f38 100644
--- a/DomainClassifier/domainclassifier.py
+++ b/DomainClassifier/domainclassifier.py
@@ -7,10 +7,15 @@ attributes.
import re
import dns.resolver
import IPy
+import redis
import socket
import time
from datetime import date, timedelta
import os
+import sys
+from uuid import uuid4
+
+from multiprocessing import Process as Proc
try:
# python 3
@@ -24,9 +29,9 @@ try:
except:
print("pybgpranking is not installed - ranking of ASN values won't be possible")
__author__ = "Alexandre Dulaunoy"
-__copyright__ = "Copyright 2012-2021, Alexandre Dulaunoy"
+__copyright__ = "Copyright 2012-2024, Alexandre Dulaunoy"
__license__ = "AGPL version 3"
-__version__ = "0.9"
+__version__ = "1.1"
class Extract:
@@ -35,7 +40,7 @@ class Extract:
from a rawtext stream. When call, the rawtext parameter is a string
containing the raw data to be process."""
- def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port=53):
+ def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port=53, redis_host='', redis_port=6379, redis_db=0, expire_time=3600, re_timeout=-1):
self.rawtext = rawtext
self.presolver = dns.resolver.Resolver()
self.presolver.nameservers = nameservers
@@ -44,6 +49,17 @@ class Extract:
self.bgprankingserver = 'pdns.circl.lu'
self.vdomain = []
self.listtld = []
+
+ self.re_domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b')
+
+ if redis_host and redis_port:
+ self.redis = redis.StrictRedis(host=redis_host, port=redis_port, db=redis_db, decode_responses=True)
+ self.uuid = str(uuid4())
+ self.re_timeout = re_timeout
+ else:
+ self.redis = None
+ self.expire_time = expire_time
+
self.domain = self.potentialdomain()
"""__origin is a private function to the ASN lookup for an IP address via
@@ -123,7 +139,32 @@ class Extract:
return self.cleandomain
- def text(self, rawtext=False):
+ def __re_findall(self, rawtext):
+ for x in re.findall(self.re_domain, rawtext):
+ if x[0]:
+ self.redis.sadd('cache:regex:{}'.format(self.uuid), x[0])
+ self.redis.expire('cache:regex:{}'.format(self.uuid), 360)
+
+ def __regex_findall(self, rawtext, timeout):
+ proc = Proc(target=self.__re_findall, args=(rawtext,))
+ try:
+ proc.start()
+ proc.join(timeout)
+ if proc.is_alive():
+ proc.terminate()
+ print('regex: processing timeout')
+ return []
+ else:
+ domains = self.redis.smembers('cache:regex:{}'.format(self.uuid))
+ self.redis.delete('cache:regex:{}'.format(self.uuid))
+ proc.terminate()
+ return domains
+ except KeyboardInterrupt:
+ print("Caught KeyboardInterrupt, terminating workers")
+ proc.terminate()
+ sys.exit(0)
+
+ def text(self, rawtext=''):
if rawtext:
self.rawtext = rawtext
self.domain = self.potentialdomain()
@@ -132,16 +173,19 @@ class Extract:
return False
"""potentialdomain method extracts potential domains matching any
- string that is a serie of string with maximun 63 character separated by a
+ string that is a serie of string with maximum 63 character separated by a
dot. The method used the rawtext defined at the instantiation of the class.
This return a list of a potential domain."""
def potentialdomain(self, validTLD=True):
self.domain = []
- domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b')
- for x in domain.findall(self.rawtext):
- if x[0]:
- self.domain.append(x[0])
+ if self.re_timeout > 0 and self.redis:
+ self.domain = list(self.__regex_findall(self.rawtext, self.re_timeout))
+ else:
+ domains = self.re_domain.findall(self.rawtext)
+ for x in domains:
+ if x[0]:
+ self.domain.append(x[0])
if validTLD:
self.domain = self.__listtld()
return self.domain
@@ -164,34 +208,54 @@ class Extract:
self.vdomain = []
for domain in self.domain:
- for dnstype in rtype:
- try:
- answers = self.presolver.query(domain, dnstype)
- except:
- pass
- else:
- # Pasive DNS output
- # timestamp||dns-client ||dns-server||RR class||Query||Query Type||Answer||TTL||Count
- if passive_dns:
- rrset = answers.rrset.to_text().splitlines()
- for dns_resp in rrset:
- dns_resp = dns_resp.split()
- passive_dns_out = (
- '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format(
- time.time(),
- self.presolver.nameservers[0],
- dns_resp[2],
- domain,
- dnstype,
- dns_resp[4],
- answers.ttl,
- )
- )
- self.vdomain.add((passive_dns_out))
- elif extended:
- self.vdomain.append((domain, dnstype, answers[0]))
+ if self.redis:
+ if self.redis.exists('dom_class:cache:{}'.format(domain)):
+ passive_dns_out = self.redis.smembers('dom_class:cache:{}'.format(domain))
+ for out in passive_dns_out:
+ if extended:
+ out = tuple(out.split('[^]', 2))
+ self.vdomain.append(out)
+ else:
+ self.vdomain.add(out)
+ else:
+
+ for dnstype in rtype:
+ try:
+ answers = self.presolver.query(domain, dnstype)
+ except:
+ pass
else:
- self.vdomain.add((domain))
+ # Passive DNS output
+ # timestamp||dns-client ||dns-server||RR class||Query||Query Type||Answer||TTL||Count
+ if passive_dns:
+ rrset = answers.rrset.to_text().splitlines()
+ for dns_resp in rrset:
+ dns_resp = dns_resp.split()
+ passive_dns_out = (
+ '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format(
+ time.time(),
+ self.presolver.nameservers[0],
+ dns_resp[2],
+ domain,
+ dnstype,
+ dns_resp[4],
+ answers.ttl,
+ )
+ )
+ self.vdomain.add((passive_dns_out))
+ if self.redis:
+ self.redis.sadd('dom_class:cache:{}'.format(domain), passive_dns_out)
+ self.redis.expire('dom_class:cache:{}'.format(domain), self.expire_time)
+ elif extended:
+ self.vdomain.append((domain, dnstype, answers[0]))
+ if self.redis:
+ self.redis.sadd('dom_class:cache:{}'.format(domain), '{}[^]{}[^]{}'.format(domain, dnstype, answers[0]))
+ self.redis.expire('dom_class:cache:{}'.format(domain), self.expire_time)
+ else:
+ self.vdomain.add((domain))
+ if self.redis:
+ self.redis.sadd('dom_class:cache:{}'.format(domain), domain)
+ self.redis.expire('dom_class:cache:{}'.format(domain), self.expire_time)
return self.vdomain
"""ipaddress method extracts from the domain list the valid IPv4 addresses"""
diff --git a/requirements.txt b/requirements.txt
index 2a1a022..8a2b9f4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
IPy
dnspython
git+https://github.com/D4-project/BGP-Ranking.git/#egg=pybgpranking&subdirectory=client
+redis
diff --git a/setup.py b/setup.py
index 4bbf58c..035305b 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages
setup(
name="DomainClassifier",
- version="1.0",
+ version="1.1",
packages=find_packages(),
install_requires=['dnspython', 'IPy', 'pybgpranking'],
dependency_links=[