From 8debd6c6b79211db3945895e768055c56ca3fdc3 Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy <a@foo.be>
Date: Sat, 30 Jul 2022 15:51:06 +0200
Subject: [PATCH] chg: [domainclassifier] clean-up code

---
 DomainClassifier/domainclassifier.py | 105 ++++++++++++++++++---------
 1 file changed, 69 insertions(+), 36 deletions(-)

diff --git a/DomainClassifier/domainclassifier.py b/DomainClassifier/domainclassifier.py
index 7d5a5ba..e0a603e 100644
--- a/DomainClassifier/domainclassifier.py
+++ b/DomainClassifier/domainclassifier.py
@@ -12,16 +12,16 @@ import time
 from datetime import date, timedelta
 
 try:
-    #python 3
+    # python 3
     import urllib.request as urllib
 except:
-    #python 2
+    # python 2
     import urllib2 as urllib
 
 try:
-   from pybgpranking import BGPRanking
+    from pybgpranking import BGPRanking
 except:
-    print ("pybgpranking is not installed - ranking of ASN values won't be possible")
+    print("pybgpranking is not installed - ranking of ASN values won't be possible")
 __author__ = "Alexandre Dulaunoy"
 __copyright__ = "Copyright 2012-2021, Alexandre Dulaunoy"
 __license__ = "AGPL version 3"
@@ -34,7 +34,7 @@ class Extract:
     from a rawtext stream. When call, the rawtext parameter is a string
     containing the raw data to be process."""
 
-    def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port= 53):
+    def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port=53):
         self.rawtext = rawtext
         self.presolver = dns.resolver.Resolver()
         self.presolver.nameservers = nameservers
@@ -52,7 +52,11 @@ class Extract:
     def __origin(self, ipaddr=None):
 
         if ipaddr:
-            clook = IPy.IP(str(ipaddr)).reverseName().replace('.in-addr.arpa.', '.origin.asn.cymru.com')
+            clook = (
+                IPy.IP(str(ipaddr))
+                .reverseName()
+                .replace('.in-addr.arpa.', '.origin.asn.cymru.com')
+            )
             try:
                 a = self.presolver.query(clook, 'TXT')
             except dns.resolver.NXDOMAIN:
@@ -62,23 +66,30 @@ class Extract:
         if a:
             x = str(a[0]).split("|")
             # why so many spaces?
-            x = list( map(lambda t: t.replace("\"", "").strip(), x) )
+            x = list(map(lambda t: t.replace("\"", "").strip(), x))
             return (x[0], x[2], a[0])
         else:
             return None
+
     """__bgpanking return the ranking the float value of an ASN.
     """
+
     def __bgpranking(self, asn=None):
         if asn:
             bgpranking = BGPRanking()
-            value = bgpranking.query(asn, date=(date.today() - timedelta(1)).isoformat())
+            value = bgpranking.query(
+                asn, date=(date.today() - timedelta(1)).isoformat()
+            )
             return value['response']['ranking']['rank']
 
     def __updatelisttld(self):
         ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
         req = urllib.Request(ianatldlist)
-        req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0')
-        tlds = ( urllib.urlopen(req).read() ).decode('utf8')
+        req.add_header(
+            'User-Agent',
+            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0',
+        )
+        tlds = (urllib.urlopen(req).read()).decode('utf8')
         tlds = tlds.split("\n")
         for tld in tlds:
             self.listtld.append(tld.lower())
@@ -104,10 +115,12 @@ class Extract:
             self.vdomain = []
             return True
         return False
+
     """potentialdomain method extracts potential domains matching any
     string that is a serie of string with maximun 63 character separated by a
     dot. The method used the rawtext defined at the instantiation of the class.
     This return a list of a potential domain."""
+
     def potentialdomain(self, validTLD=True):
         self.domain = []
         domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b')
@@ -124,7 +137,12 @@ class Extract:
     returns a list of existing domain. If the extended flag is true, a set is
     return with the associated DNS resources found."""
 
-    def validdomain(self, rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'], extended=True, passive_dns=False):
+    def validdomain(
+        self,
+        rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'],
+        extended=True,
+        passive_dns=False,
+    ):
         if extended is False:
             self.vdomain = set()
         else:
@@ -143,7 +161,17 @@ class Extract:
                         rrset = answers.rrset.to_text().splitlines()
                         for dns_resp in rrset:
                             dns_resp = dns_resp.split()
-                            passive_dns_out = '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format(time.time(), self.presolver.nameservers[0], dns_resp[2], domain, dnstype, dns_resp[4], answers.ttl)
+                            passive_dns_out = (
+                                '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format(
+                                    time.time(),
+                                    self.presolver.nameservers[0],
+                                    dns_resp[2],
+                                    domain,
+                                    dnstype,
+                                    dns_resp[4],
+                                    answers.ttl,
+                                )
+                            )
                             self.vdomain.add((passive_dns_out))
                     elif extended:
                         self.vdomain.append((domain, dnstype, answers[0]))
@@ -188,7 +216,7 @@ class Extract:
                     orig = self.__origin(ipaddr=dom[2])[1]
                 except:
                     continue
-                if(orig == cc):
+                if orig == cc:
                     self.localdom.append(dom)
             elif dom[1] == 'CNAME':
                 cname = str(dom[2])
@@ -197,7 +225,7 @@ class Extract:
                     orig = self.__origin(ipaddr=ip)[1]
                 except:
                     continue
-                if(orig == cc):
+                if orig == cc:
                     self.localdom.append(dom)
         return self.localdom
 
@@ -276,32 +304,37 @@ class Extract:
             if type(dom) == tuple:
                 dom = dom[0]
             if includefilter.search(dom):
-                    self.cleandomain.append(dom)
+                self.cleandomain.append(dom)
 
         return set(self.cleandomain)
 
+
 if __name__ == "__main__":
-    c = Extract(rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist")
-    c.text(rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be")
-    print (c.potentialdomain())
-    print (c.potentialdomain(validTLD=True))
-    print (c.validdomain(extended=True))
-    print ("US:")
-    print (c.localizedomain(cc='US'))
-    print ("LU:")
-    print (c.localizedomain(cc='LU'))
-    print ("BE:")
-    print (c.localizedomain(cc='BE'))
-    print ("Ranking:")
-    print (c.rankdomain())
-    print ("List of ip addresses:")
-    print (c.ipaddress(extended=False))
-    print ("Include dot.lu:")
-    print (c.include(expression=r'\.lu$'))
-    print ("Exclude dot.lu:")
-    print (c.exclude(expression=r'\.lu$'))
+    c = Extract(
+        rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist"
+    )
+    c.text(
+        rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be"
+    )
+    print(c.potentialdomain())
+    print(c.potentialdomain(validTLD=True))
+    print(c.validdomain(extended=True))
+    print("US:")
+    print(c.localizedomain(cc='US'))
+    print("LU:")
+    print(c.localizedomain(cc='LU'))
+    print("BE:")
+    print(c.localizedomain(cc='BE'))
+    print("Ranking:")
+    print(c.rankdomain())
+    print("List of ip addresses:")
+    print(c.ipaddress(extended=False))
+    print("Include dot.lu:")
+    print(c.include(expression=r'\.lu$'))
+    print("Exclude dot.lu:")
+    print(c.exclude(expression=r'\.lu$'))
     c.text(rawtext="www.lwn.net www.undeadly.org")
-    print (c.potentialdomain(validTLD=True))
+    print(c.potentialdomain(validTLD=True))
     c.validdomain()
-    print (c.localizedomain(cc='US'))
+    print(c.localizedomain(cc='US'))
     print(c.validdomain(extended=False, passive_dns=True))