chg: [domainclassifier] clean-up code

2024-11-22 09:57:07 +00:00 · 2022-07-30 15:51:06 +02:00 · 2022-07-30 15:51:06 +02:00 · 8debd6c6b7
commit 8debd6c6b7
parent 115c48f65c
1 changed files with 69 additions and 36 deletions
--- a/DomainClassifier/domainclassifier.py
+++ b/DomainClassifier/domainclassifier.py
@ -52,7 +52,11 @@ class Extract:
    def __origin(self, ipaddr=None):

        if ipaddr:
-            clook = IPy.IP(str(ipaddr)).reverseName().replace('.in-addr.arpa.', '.origin.asn.cymru.com')
+            clook = (
+                IPy.IP(str(ipaddr))
+                .reverseName()
+                .replace('.in-addr.arpa.', '.origin.asn.cymru.com')
+            )
            try:
                a = self.presolver.query(clook, 'TXT')
            except dns.resolver.NXDOMAIN:
@ -66,18 +70,25 @@ class Extract:
            return (x[0], x[2], a[0])
        else:
            return None
+
    """__bgpranking returns the ranking, as a float value, of an ASN.
    """
+
    def __bgpranking(self, asn=None):
        if asn:
            bgpranking = BGPRanking()
-            value = bgpranking.query(asn, date=(date.today() - timedelta(1)).isoformat())
+            value = bgpranking.query(
+                asn, date=(date.today() - timedelta(1)).isoformat()
+            )
            return value['response']['ranking']['rank']

    def __updatelisttld(self):
        ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
        req = urllib.Request(ianatldlist)
-        req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0')
+        req.add_header(
+            'User-Agent',
+            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0',
+        )
        tlds = (urllib.urlopen(req).read()).decode('utf8')
        tlds = tlds.split("\n")
        for tld in tlds:
@ -104,10 +115,12 @@ class Extract:
            self.vdomain = []
            return True
        return False
+
    """potentialdomain method extracts potential domains matching any
    string that is a series of strings of at most 63 characters each, separated
    by dots. The method uses the rawtext defined at the instantiation of the
    class. It returns a list of potential domains."""
+
    def potentialdomain(self, validTLD=True):
        self.domain = []
        domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b')
@ -124,7 +137,12 @@ class Extract:
    returns a list of existing domains. If the extended flag is true, a set is
    returned with the associated DNS resources found."""

-    def validdomain(self, rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'], extended=True, passive_dns=False):
+    def validdomain(
+        self,
+        rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'],
+        extended=True,
+        passive_dns=False,
+    ):
        if extended is False:
            self.vdomain = set()
        else:
@ -143,7 +161,17 @@ class Extract:
                        rrset = answers.rrset.to_text().splitlines()
                        for dns_resp in rrset:
                            dns_resp = dns_resp.split()
-                            passive_dns_out = '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format(time.time(), self.presolver.nameservers[0], dns_resp[2], domain, dnstype, dns_resp[4], answers.ttl)
+                            passive_dns_out = (
+                                '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format(
+                                    time.time(),
+                                    self.presolver.nameservers[0],
+                                    dns_resp[2],
+                                    domain,
+                                    dnstype,
+                                    dns_resp[4],
+                                    answers.ttl,
+                                )
+                            )
                            self.vdomain.add((passive_dns_out))
                    elif extended:
                        self.vdomain.append((domain, dnstype, answers[0]))
@ -188,7 +216,7 @@ class Extract:
                    orig = self.__origin(ipaddr=dom[2])[1]
                except:
                    continue
-                if(orig == cc):
+                if orig == cc:
                    self.localdom.append(dom)
            elif dom[1] == 'CNAME':
                cname = str(dom[2])
@ -197,7 +225,7 @@ class Extract:
                    orig = self.__origin(ipaddr=ip)[1]
                except:
                    continue
-                if(orig == cc):
+                if orig == cc:
                    self.localdom.append(dom)
        return self.localdom

@ -280,9 +308,14 @@ class Extract:

        return set(self.cleandomain)

+
 if __name__ == "__main__":
-    c = Extract(rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist")
-    c.text(rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be")
+    c = Extract(
+        rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist"
+    )
+    c.text(
+        rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be"
+    )
    print(c.potentialdomain())
    print(c.potentialdomain(validTLD=True))
    print(c.validdomain(extended=True))