From 115c48f65c69b0cab29cd5c879c201fe4fe9ad6c Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Sat, 30 Jul 2022 15:44:23 +0200 Subject: [PATCH 1/5] fix: [DomainClassifier] set optional dns port --- DomainClassifier/domainclassifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DomainClassifier/domainclassifier.py b/DomainClassifier/domainclassifier.py index 1ca5e8f..7d5a5ba 100644 --- a/DomainClassifier/domainclassifier.py +++ b/DomainClassifier/domainclassifier.py @@ -38,7 +38,7 @@ class Extract: self.rawtext = rawtext self.presolver = dns.resolver.Resolver() self.presolver.nameservers = nameservers - self.presolver.port = 53 + self.presolver.port = port self.presolver.lifetime = 1.0 self.bgprankingserver = 'pdns.circl.lu' self.vdomain = [] From 8debd6c6b79211db3945895e768055c56ca3fdc3 Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Sat, 30 Jul 2022 15:51:06 +0200 Subject: [PATCH 2/5] chg: [domainclassifier] clean-up code --- DomainClassifier/domainclassifier.py | 105 ++++++++++++++++++--------- 1 file changed, 69 insertions(+), 36 deletions(-) diff --git a/DomainClassifier/domainclassifier.py b/DomainClassifier/domainclassifier.py index 7d5a5ba..e0a603e 100644 --- a/DomainClassifier/domainclassifier.py +++ b/DomainClassifier/domainclassifier.py @@ -12,16 +12,16 @@ import time from datetime import date, timedelta try: - #python 3 + # python 3 import urllib.request as urllib except: - #python 2 + # python 2 import urllib2 as urllib try: - from pybgpranking import BGPRanking + from pybgpranking import BGPRanking except: - print ("pybgpranking is not installed - ranking of ASN values won't be possible") + print("pybgpranking is not installed - ranking of ASN values won't be possible") __author__ = "Alexandre Dulaunoy" __copyright__ = "Copyright 2012-2021, Alexandre Dulaunoy" __license__ = "AGPL version 3" @@ -34,7 +34,7 @@ class Extract: from a rawtext stream. When call, the rawtext parameter is a string containing the raw data to be process.""" - def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port= 53): + def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port=53): self.rawtext = rawtext self.presolver = dns.resolver.Resolver() self.presolver.nameservers = nameservers @@ -52,7 +52,11 @@ class Extract: def __origin(self, ipaddr=None): if ipaddr: - clook = IPy.IP(str(ipaddr)).reverseName().replace('.in-addr.arpa.', '.origin.asn.cymru.com') + clook = ( + IPy.IP(str(ipaddr)) + .reverseName() + .replace('.in-addr.arpa.', '.origin.asn.cymru.com') + ) try: a = self.presolver.query(clook, 'TXT') except dns.resolver.NXDOMAIN: @@ -62,23 +66,30 @@ class Extract: if a: x = str(a[0]).split("|") # why so many spaces? - x = list( map(lambda t: t.replace("\"", "").strip(), x) ) + x = list(map(lambda t: t.replace("\"", "").strip(), x)) return (x[0], x[2], a[0]) else: return None + """__bgpanking return the ranking the float value of an ASN. """ + def __bgpranking(self, asn=None): if asn: bgpranking = BGPRanking() - value = bgpranking.query(asn, date=(date.today() - timedelta(1)).isoformat()) + value = bgpranking.query( + asn, date=(date.today() - timedelta(1)).isoformat() + ) return value['response']['ranking']['rank'] def __updatelisttld(self): ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt" req = urllib.Request(ianatldlist) - req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0') - tlds = ( urllib.urlopen(req).read() ).decode('utf8') + req.add_header( + 'User-Agent', + 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0', + ) + tlds = (urllib.urlopen(req).read()).decode('utf8') tlds = tlds.split("\n") for tld in tlds: self.listtld.append(tld.lower()) @@ -104,10 +115,12 @@ class Extract: self.vdomain = [] return True return False + """potentialdomain method extracts potential domains matching any string that is a serie of string with maximun 63 character separated by a dot. The method used the rawtext defined at the instantiation of the class. This return a list of a potential domain.""" + def potentialdomain(self, validTLD=True): self.domain = [] domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b') @@ -124,7 +137,12 @@ class Extract: returns a list of existing domain. If the extended flag is true, a set is return with the associated DNS resources found.""" - def validdomain(self, rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'], extended=True, passive_dns=False): + def validdomain( + self, + rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'], + extended=True, + passive_dns=False, + ): if extended is False: self.vdomain = set() else: @@ -143,7 +161,17 @@ class Extract: rrset = answers.rrset.to_text().splitlines() for dns_resp in rrset: dns_resp = dns_resp.split() - passive_dns_out = '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format(time.time(), self.presolver.nameservers[0], dns_resp[2], domain, dnstype, dns_resp[4], answers.ttl) + passive_dns_out = ( + '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format( + time.time(), + self.presolver.nameservers[0], + dns_resp[2], + domain, + dnstype, + dns_resp[4], + answers.ttl, + ) + ) self.vdomain.add((passive_dns_out)) elif extended: self.vdomain.append((domain, dnstype, answers[0])) @@ -188,7 +216,7 @@ class Extract: orig = self.__origin(ipaddr=dom[2])[1] except: continue - if(orig == cc): + if orig == cc: self.localdom.append(dom) elif dom[1] == 'CNAME': cname = str(dom[2]) @@ -197,7 +225,7 @@ class Extract: orig = self.__origin(ipaddr=ip)[1] except: continue - if(orig == cc): + if orig == cc: self.localdom.append(dom) return self.localdom @@ -276,32 +304,37 @@ class Extract: if type(dom) == tuple: dom = dom[0] if includefilter.search(dom): - self.cleandomain.append(dom) + self.cleandomain.append(dom) return set(self.cleandomain) + if __name__ == "__main__": - c = Extract(rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist") - c.text(rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be") - print (c.potentialdomain()) - print (c.potentialdomain(validTLD=True)) - print (c.validdomain(extended=True)) - print ("US:") - print (c.localizedomain(cc='US')) - print ("LU:") - print (c.localizedomain(cc='LU')) - print ("BE:") - print (c.localizedomain(cc='BE')) - print ("Ranking:") - print (c.rankdomain()) - print ("List of ip addresses:") - print (c.ipaddress(extended=False)) - print ("Include dot.lu:") - print (c.include(expression=r'\.lu$')) - print ("Exclude dot.lu:") - print (c.exclude(expression=r'\.lu$')) + c = Extract( + rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist" + ) + c.text( + rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be" + ) + print(c.potentialdomain()) + print(c.potentialdomain(validTLD=True)) + print(c.validdomain(extended=True)) + print("US:") + print(c.localizedomain(cc='US')) + print("LU:") + print(c.localizedomain(cc='LU')) + print("BE:") + print(c.localizedomain(cc='BE')) + print("Ranking:") + print(c.rankdomain()) + print("List of ip addresses:") + print(c.ipaddress(extended=False)) + print("Include dot.lu:") + print(c.include(expression=r'\.lu$')) + print("Exclude dot.lu:") + print(c.exclude(expression=r'\.lu$')) c.text(rawtext="www.lwn.net www.undeadly.org") - print (c.potentialdomain(validTLD=True)) + print(c.potentialdomain(validTLD=True)) c.validdomain() - print (c.localizedomain(cc='US')) + print(c.localizedomain(cc='US')) print(c.validdomain(extended=False, passive_dns=True)) From 1e55e0a5a7a573c0da4ca565695b1507eb2cd464 Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Sat, 30 Jul 2022 16:14:21 +0200 Subject: [PATCH 3/5] new: [domainclassifier] add a simple cache of the TLDs list from IANA (to avoid downloading at each start of the library) --- DomainClassifier/domainclassifier.py | 29 +++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/DomainClassifier/domainclassifier.py b/DomainClassifier/domainclassifier.py index e0a603e..40ff813 100644 --- a/DomainClassifier/domainclassifier.py +++ b/DomainClassifier/domainclassifier.py @@ -10,6 +10,7 @@ import IPy import socket import time from datetime import date, timedelta +import os try: # python 3 @@ -82,14 +83,28 @@ class Extract: ) return value['response']['ranking']['rank'] - def __updatelisttld(self): + def __updatelisttld(self, force=False): ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt" - req = urllib.Request(ianatldlist) - req.add_header( - 'User-Agent', - 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0', - ) - tlds = (urllib.urlopen(req).read()).decode('utf8') + userdir = os.path.expanduser("~") + cachedir = os.path.join(userdir, ".DomainClassifier") + if not os.path.exists(cachedir): + os.mkdir(cachedir) + tldcache = os.path.join(cachedir, "tlds") + if not os.path.exists(tldcache): + print(tldcache) + req = urllib.Request(ianatldlist) + req.add_header( + 'User-Agent', + 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0', + ) + tlds = (urllib.urlopen(req).read()).decode('utf8') + f = open(tldcache, "wb") + f.write(tlds.encode("utf-8")) + f.close() + + f = open(tldcache, "r") + tlds = f.read() + f.close() tlds = tlds.split("\n") for tld in tlds: self.listtld.append(tld.lower()) From c769bba999334a6e66f7ee1f88aa8513aac55db7 Mon Sep 17 00:00:00 2001 From: terrtia Date: Tue, 9 Jan 2024 15:02:22 +0100 Subject: [PATCH 4/5] chg: [domainclassifier] update req user-agent --- DomainClassifier/domainclassifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DomainClassifier/domainclassifier.py b/DomainClassifier/domainclassifier.py index 40ff813..e072e5c 100644 --- a/DomainClassifier/domainclassifier.py +++ b/DomainClassifier/domainclassifier.py @@ -95,7 +95,7 @@ class Extract: req = urllib.Request(ianatldlist) req.add_header( 'User-Agent', - 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0', + 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0', ) tlds = (urllib.urlopen(req).read()).decode('utf8') f = open(tldcache, "wb") From 75a546bc3e39aeb58f8eba6981a15c5e552114ac Mon Sep 17 00:00:00 2001 From: terrtia Date: Wed, 10 Jan 2024 14:28:00 +0100 Subject: [PATCH 5/5] chg: [domclassifier] add dns records redis cache + regex timeout --- DomainClassifier/__init__.py | 2 +- DomainClassifier/domainclassifier.py | 136 ++++++++++++++++++++------- requirements.txt | 1 + setup.py | 2 +- 4 files changed, 103 insertions(+), 38 deletions(-) diff --git a/DomainClassifier/__init__.py b/DomainClassifier/__init__.py index 4802e90..f901408 100644 --- a/DomainClassifier/__init__.py +++ b/DomainClassifier/__init__.py @@ -1 +1 @@ -__version__ = "1.0" +__version__ = "1.1" diff --git a/DomainClassifier/domainclassifier.py b/DomainClassifier/domainclassifier.py index e072e5c..92c7f38 100644 --- a/DomainClassifier/domainclassifier.py +++ b/DomainClassifier/domainclassifier.py @@ -7,10 +7,15 @@ attributes. import re import dns.resolver import IPy +import redis import socket import time from datetime import date, timedelta import os +import sys +from uuid import uuid4 + +from multiprocessing import Process as Proc try: # python 3 @@ -24,9 +29,9 @@ try: except: print("pybgpranking is not installed - ranking of ASN values won't be possible") __author__ = "Alexandre Dulaunoy" -__copyright__ = "Copyright 2012-2021, Alexandre Dulaunoy" +__copyright__ = "Copyright 2012-2024, Alexandre Dulaunoy" __license__ = "AGPL version 3" -__version__ = "0.9" +__version__ = "1.1" class Extract: @@ -35,7 +40,7 @@ class Extract: from a rawtext stream. When call, the rawtext parameter is a string containing the raw data to be process.""" - def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port=53): + def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port=53, redis_host='', redis_port=6379, redis_db=0, expire_time=3600, re_timeout=-1): self.rawtext = rawtext self.presolver = dns.resolver.Resolver() self.presolver.nameservers = nameservers @@ -44,6 +49,17 @@ class Extract: self.bgprankingserver = 'pdns.circl.lu' self.vdomain = [] self.listtld = [] + + self.re_domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b') + + if redis_host and redis_port: + self.redis = redis.StrictRedis(host=redis_host, port=redis_port, db=redis_db, decode_responses=True) + self.uuid = str(uuid4()) + self.re_timeout = re_timeout + else: + self.redis = None + self.expire_time = expire_time + self.domain = self.potentialdomain() """__origin is a private function to the ASN lookup for an IP address via @@ -123,7 +139,32 @@ class Extract: return self.cleandomain - def text(self, rawtext=False): + def __re_findall(self, rawtext): + for x in re.findall(self.re_domain, rawtext): + if x[0]: + self.redis.sadd('cache:regex:{}'.format(self.uuid), x[0]) + self.redis.expire('cache:regex:{}'.format(self.uuid), 360) + + def __regex_findall(self, rawtext, timeout): + proc = Proc(target=self.__re_findall, args=(rawtext,)) + try: + proc.start() + proc.join(timeout) + if proc.is_alive(): + proc.terminate() + print('regex: processing timeout') + return [] + else: + domains = self.redis.smembers('cache:regex:{}'.format(self.uuid)) + self.redis.delete('cache:regex:{}'.format(self.uuid)) + proc.terminate() + return domains + except KeyboardInterrupt: + print("Caught KeyboardInterrupt, terminating workers") + proc.terminate() + sys.exit(0) + + def text(self, rawtext=''): if rawtext: self.rawtext = rawtext self.domain = self.potentialdomain() @@ -132,16 +173,19 @@ class Extract: return False """potentialdomain method extracts potential domains matching any - string that is a serie of string with maximun 63 character separated by a + string that is a serie of string with maximum 63 character separated by a dot. The method used the rawtext defined at the instantiation of the class. This return a list of a potential domain.""" def potentialdomain(self, validTLD=True): self.domain = [] - domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b') - for x in domain.findall(self.rawtext): - if x[0]: - self.domain.append(x[0]) + if self.re_timeout > 0 and self.redis: + self.domain = list(self.__regex_findall(self.rawtext, self.re_timeout)) + else: + domains = self.re_domain.findall(self.rawtext) + for x in domains: + if x[0]: + self.domain.append(x[0]) if validTLD: self.domain = self.__listtld() return self.domain @@ -164,34 +208,54 @@ class Extract: self.vdomain = [] for domain in self.domain: - for dnstype in rtype: - try: - answers = self.presolver.query(domain, dnstype) - except: - pass - else: - # Pasive DNS output - # timestamp||dns-client ||dns-server||RR class||Query||Query Type||Answer||TTL||Count - if passive_dns: - rrset = answers.rrset.to_text().splitlines() - for dns_resp in rrset: - dns_resp = dns_resp.split() - passive_dns_out = ( - '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format( - time.time(), - self.presolver.nameservers[0], - dns_resp[2], - domain, - dnstype, - dns_resp[4], - answers.ttl, - ) - ) - self.vdomain.add((passive_dns_out)) - elif extended: - self.vdomain.append((domain, dnstype, answers[0])) + if self.redis: + if self.redis.exists('dom_class:cache:{}'.format(domain)): + passive_dns_out = self.redis.smembers('dom_class:cache:{}'.format(domain)) + for out in passive_dns_out: + if extended: + out = tuple(out.split('[^]', 2)) + self.vdomain.append(out) + else: + self.vdomain.add(out) + else: + + for dnstype in rtype: + try: + answers = self.presolver.query(domain, dnstype) + except: + pass else: - self.vdomain.add((domain)) + # Passive DNS output + # timestamp||dns-client ||dns-server||RR class||Query||Query Type||Answer||TTL||Count + if passive_dns: + rrset = answers.rrset.to_text().splitlines() + for dns_resp in rrset: + dns_resp = dns_resp.split() + passive_dns_out = ( + '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format( + time.time(), + self.presolver.nameservers[0], + dns_resp[2], + domain, + dnstype, + dns_resp[4], + answers.ttl, + ) + ) + self.vdomain.add((passive_dns_out)) + if self.redis: + self.redis.sadd('dom_class:cache:{}'.format(domain), passive_dns_out) + self.redis.expire('dom_class:cache:{}'.format(domain), self.expire_time) + elif extended: + self.vdomain.append((domain, dnstype, answers[0])) + if self.redis: + self.redis.sadd('dom_class:cache:{}'.format(domain), '{}[^]{}[^]{}'.format(domain, dnstype, answers[0])) + self.redis.expire('dom_class:cache:{}'.format(domain), self.expire_time) + else: + self.vdomain.add((domain)) + if self.redis: + self.redis.sadd('dom_class:cache:{}'.format(domain), domain) + self.redis.expire('dom_class:cache:{}'.format(domain), self.expire_time) return self.vdomain """ipaddress method extracts from the domain list the valid IPv4 addresses""" diff --git a/requirements.txt b/requirements.txt index 2a1a022..8a2b9f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ IPy dnspython git+https://github.com/D4-project/BGP-Ranking.git/#egg=pybgpranking&subdirectory=client +redis diff --git a/setup.py b/setup.py index 4bbf58c..035305b 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup( name="DomainClassifier", - version="1.0", + version="1.1", packages=find_packages(), install_requires=['dnspython', 'IPy', 'pybgpranking'], dependency_links=[