From 90fc808eec5cfb1d4455c21904de1ce771b93163 Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Sun, 31 Aug 2014 14:46:13 +0200 Subject: [PATCH] ValidTLD option added A new default option has been introduced in DomainClassifier which is the validTLD option (enable by default). Based on the assigned list of TLD, the extraction of potential domains is filtered to the IANA assigned list. If you are extracting the data for non-assigned/internal TLDs, you can disable the default option with validTLD=False on the potentialdomain function. The list of assigned TLDs is downloaded from IANA. --- DomainClassifier/domainclassifier.py | 122 +++++++++++++++++++-------- 1 file changed, 86 insertions(+), 36 deletions(-) diff --git a/DomainClassifier/domainclassifier.py b/DomainClassifier/domainclassifier.py index 8c28e58..b858d1a 100644 --- a/DomainClassifier/domainclassifier.py +++ b/DomainClassifier/domainclassifier.py @@ -8,12 +8,12 @@ import re import dns.resolver import IPy import socket +import urllib2 __author__ = "Alexandre Dulaunoy" __copyright__ = "Copyright 2012-2014, Alexandre Dulaunoy" __license__ = "AGPL version 3" -__version__ = "0.2" - +__version__ = "0.3" class Extract: @@ -22,7 +22,6 @@ class Extract: from a rawtext stream. When call, the rawtext parameter is a string containing the raw data to be process.""" - def __init__(self, rawtext=None, nameservers=['8.8.8.8']): self.rawtext = rawtext self.presolver = dns.resolver.Resolver() @@ -30,6 +29,7 @@ class Extract: self.presolver.lifetime = 1.0 self.bgprankingserver = 'pdns.circl.lu' self.vdomain = [] + self.listtld = [] self.domain = self.potentialdomain() """__origin is a private function to the ASN lookup for an IP address via @@ -39,23 +39,26 @@ class Extract: def __origin(self, ipaddr=None): if ipaddr: - clook = IPy.IP(str(ipaddr)).reverseName().replace('.in-addr.arpa.','.origin.asn.cymru.com') - try: a = self.presolver.query(clook, 'TXT') - except dns.resolver.NXDOMAIN: return None - except dns.exception.Timeout: return None + clook = IPy.IP(str(ipaddr)).reverseName().replace('.in-addr.arpa.', '.origin.asn.cymru.com') + try: + a = self.presolver.query(clook, 'TXT') + except dns.resolver.NXDOMAIN: + return None + except dns.exception.Timeout: + return None if a: - x = str(a[0]).split("|") - # why so many spaces? - x = map (lambda t: t.replace("\"","").strip(), x) - return (x[0],x[2],a[0]) + x = str(a[0]).split("|") + # why so many spaces? + x = map(lambda t: t.replace("\"", "").strip(), x) + return (x[0], x[2], a[0]) else: - return None + return None """__bgpanking return the ranking the float value of an ASN. """ def __bgpranking(self, asn=None): if asn: s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.connect((self.bgprankingserver,43)) + s.connect((self.bgprankingserver, 43)) s.send(asn+"\r\n") r = '' while True: @@ -65,28 +68,51 @@ class Extract: break s.close() if len(r) > 0: - try: rr = r.split("\n")[1].split(",") - except IndexError: return None - if len(rr) > 1: - rank = rr[1] - return float(rank) - else: - return None + try: + rr = r.split("\n")[1].split(",") + except IndexError: + return None + if len(rr) > 1: + rank = rr[1] + return float(rank) + else: + return None else: return None + def __updatelisttld(self): + ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt" + tlds = urllib2.urlopen(ianatldlist, ianatldlist).read() + tlds = tlds.split("\n") + for tld in tlds: + self.listtld.append(tld.lower()) + + def __listtld(self): + if not self.listtld: + self.__updatelisttld() + self.cleandomain = [] + if self.domain is None: + return False + for domain in self.domain: + lastpart = domain.rsplit(".")[-1:][0] + for tld in self.listtld: + if lastpart == tld: + self.cleandomain.append(domain) + + return self.cleandomain + """potentialdomain method extracts potential domains matching any string that is a serie of string with maximun 63 character separated by a dot. The method used the rawtext defined at the instantiation of the class. This return a list of a potential domain.""" - - def potentialdomain(self): + def potentialdomain(self, validTLD=True): self.domain = [] domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b') for x in domain.findall(self.rawtext): if x[0]: self.domain.append(x[0]) - + if validTLD: + self.domain = self.__listtld() return self.domain """validdomain method used the extracted domains from the domain method to @@ -95,7 +121,7 @@ class Extract: returns a list of existing domain. If the extended flag is true, a set is return with the associated DNS resources found.""" - def validdomain(self, rtype=['A','AAAA','SOA','MX','CNAME'], extended=True): + def validdomain(self, rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'], extended=True): if extended is False: self.validdomain = set() else: @@ -112,7 +138,7 @@ class Extract: if extended is False: self.validdomain.add((domain)) else: - self.validdomain.append((domain,dnstype,answers[0])) + self.validdomain.append((domain, dnstype, answers[0])) return self.validdomain """ipaddress method extracts from the domain list the valid IPv4 addresses""" @@ -133,7 +159,7 @@ class Extract: self.ipaddresses.append((d)) else: orig = self.__origin(ipaddr=d) - self.ipaddresses.add((d,str(orig))) + self.ipaddresses.add((d, str(orig))) return self.ipaddresses @@ -151,7 +177,8 @@ class Extract: orig = self.__origin(ipaddr=dom[2])[1] except: continue - if(orig == cc): self.localdom.append(dom) + if(orig == cc): + self.localdom.append(dom) elif dom[1] == 'CNAME': cname = str(dom[2]) ip = socket.gethostbyname(cname) @@ -159,7 +186,8 @@ class Extract: orig = self.__origin(ipaddr=ip)[1] except: continue - if(orig == cc): self.localdom.append(dom) + if(orig == cc): + self.localdom.append(dom) return self.localdom """rankdomain method use the validdomain list (in extended format to rank @@ -184,21 +212,23 @@ class Extract: self.rankdom.append(t) elif dom[1] == 'CNAME': cname = str(dom[2]) - try: ip = socket.gethostbyname(cname) - except: continue - try: asn = self.__origin(ipaddr=ip)[0] - except TypeError: continue + try: + ip = socket.gethostbyname(cname) + except: + continue + try: + asn = self.__origin(ipaddr=ip)[0] + except TypeError: + continue rank = self.__bgpranking(asn) t = (rank, dom[0]) self.rankdom.append(t) return sorted(self.rankdom, key=lambda d: d[0]) - - """exclude domains from a regular expression. If validdomain was called, it's only on the valid domain list.""" - def exclude(self,expression=None): + def exclude(self, expression=None): self.cleandomain = [] excludefilter = re.compile(expression) @@ -218,7 +248,7 @@ class Extract: """include domains from a regular expression. If validdomain was called, it's only on the valid domain list.""" - def include(self,expression=None): + def include(self, expression=None): self.cleandomain = [] includefilter = re.compile(expression) @@ -234,4 +264,24 @@ class Extract: return self.cleandomain +if __name__ == "__main__": + c = Extract(rawtext="www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist", nameservers=['127.0.0.1']) + + print c.potentialdomain() + print c.potentialdomain(validTLD=True) + print c.validdomain(extended=True) + print "US:" + print c.localizedomain(cc='US') + print "LU:" + print c.localizedomain(cc='LU') + print "BE:" + print c.localizedomain(cc='BE') + print "Ranking:" + print c.rankdomain() + print "List of ip addresses:" + print c.ipaddress(extended=True) + print "Include dot.lu:" + print c.include(expression=r'\.lu$') + print "Exclude dot.lu:" + print c.exclude(expression=r'\.lu$')