diff --git a/DomainClassifier/domainclassifier.py b/DomainClassifier/domainclassifier.py index 8c28e58..b858d1a 100644 --- a/DomainClassifier/domainclassifier.py +++ b/DomainClassifier/domainclassifier.py @@ -8,12 +8,12 @@ import re import dns.resolver import IPy import socket +import urllib2 __author__ = "Alexandre Dulaunoy" __copyright__ = "Copyright 2012-2014, Alexandre Dulaunoy" __license__ = "AGPL version 3" -__version__ = "0.2" - +__version__ = "0.3" class Extract: @@ -22,7 +22,6 @@ class Extract: from a rawtext stream. When call, the rawtext parameter is a string containing the raw data to be process.""" - def __init__(self, rawtext=None, nameservers=['8.8.8.8']): self.rawtext = rawtext self.presolver = dns.resolver.Resolver() @@ -30,6 +29,7 @@ class Extract: self.presolver.lifetime = 1.0 self.bgprankingserver = 'pdns.circl.lu' self.vdomain = [] + self.listtld = [] self.domain = self.potentialdomain() """__origin is a private function to the ASN lookup for an IP address via @@ -39,23 +39,26 @@ class Extract: def __origin(self, ipaddr=None): if ipaddr: - clook = IPy.IP(str(ipaddr)).reverseName().replace('.in-addr.arpa.','.origin.asn.cymru.com') - try: a = self.presolver.query(clook, 'TXT') - except dns.resolver.NXDOMAIN: return None - except dns.exception.Timeout: return None + clook = IPy.IP(str(ipaddr)).reverseName().replace('.in-addr.arpa.', '.origin.asn.cymru.com') + try: + a = self.presolver.query(clook, 'TXT') + except dns.resolver.NXDOMAIN: + return None + except dns.exception.Timeout: + return None if a: - x = str(a[0]).split("|") - # why so many spaces? - x = map (lambda t: t.replace("\"","").strip(), x) - return (x[0],x[2],a[0]) + x = str(a[0]).split("|") + # why so many spaces? + x = map(lambda t: t.replace("\"", "").strip(), x) + return (x[0], x[2], a[0]) else: - return None + return None """__bgpanking return the ranking the float value of an ASN. """ def __bgpranking(self, asn=None): if asn: s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.connect((self.bgprankingserver,43)) + s.connect((self.bgprankingserver, 43)) s.send(asn+"\r\n") r = '' while True: @@ -65,28 +68,51 @@ class Extract: break s.close() if len(r) > 0: - try: rr = r.split("\n")[1].split(",") - except IndexError: return None - if len(rr) > 1: - rank = rr[1] - return float(rank) - else: - return None + try: + rr = r.split("\n")[1].split(",") + except IndexError: + return None + if len(rr) > 1: + rank = rr[1] + return float(rank) + else: + return None else: return None + def __updatelisttld(self): + ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt" + tlds = urllib2.urlopen(ianatldlist, ianatldlist).read() + tlds = tlds.split("\n") + for tld in tlds: + self.listtld.append(tld.lower()) + + def __listtld(self): + if not self.listtld: + self.__updatelisttld() + self.cleandomain = [] + if self.domain is None: + return False + for domain in self.domain: + lastpart = domain.rsplit(".")[-1:][0] + for tld in self.listtld: + if lastpart == tld: + self.cleandomain.append(domain) + + return self.cleandomain + """potentialdomain method extracts potential domains matching any string that is a serie of string with maximun 63 character separated by a dot. The method used the rawtext defined at the instantiation of the class. This return a list of a potential domain.""" - - def potentialdomain(self): + def potentialdomain(self, validTLD=True): self.domain = [] domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b') for x in domain.findall(self.rawtext): if x[0]: self.domain.append(x[0]) - + if validTLD: + self.domain = self.__listtld() return self.domain """validdomain method used the extracted domains from the domain method to @@ -95,7 +121,7 @@ class Extract: returns a list of existing domain. If the extended flag is true, a set is return with the associated DNS resources found.""" - def validdomain(self, rtype=['A','AAAA','SOA','MX','CNAME'], extended=True): + def validdomain(self, rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'], extended=True): if extended is False: self.validdomain = set() else: @@ -112,7 +138,7 @@ class Extract: if extended is False: self.validdomain.add((domain)) else: - self.validdomain.append((domain,dnstype,answers[0])) + self.validdomain.append((domain, dnstype, answers[0])) return self.validdomain """ipaddress method extracts from the domain list the valid IPv4 addresses""" @@ -133,7 +159,7 @@ class Extract: self.ipaddresses.append((d)) else: orig = self.__origin(ipaddr=d) - self.ipaddresses.add((d,str(orig))) + self.ipaddresses.add((d, str(orig))) return self.ipaddresses @@ -151,7 +177,8 @@ class Extract: orig = self.__origin(ipaddr=dom[2])[1] except: continue - if(orig == cc): self.localdom.append(dom) + if(orig == cc): + self.localdom.append(dom) elif dom[1] == 'CNAME': cname = str(dom[2]) ip = socket.gethostbyname(cname) @@ -159,7 +186,8 @@ class Extract: orig = self.__origin(ipaddr=ip)[1] except: continue - if(orig == cc): self.localdom.append(dom) + if(orig == cc): + self.localdom.append(dom) return self.localdom """rankdomain method use the validdomain list (in extended format to rank @@ -184,21 +212,23 @@ class Extract: self.rankdom.append(t) elif dom[1] == 'CNAME': cname = str(dom[2]) - try: ip = socket.gethostbyname(cname) - except: continue - try: asn = self.__origin(ipaddr=ip)[0] - except TypeError: continue + try: + ip = socket.gethostbyname(cname) + except: + continue + try: + asn = self.__origin(ipaddr=ip)[0] + except TypeError: + continue rank = self.__bgpranking(asn) t = (rank, dom[0]) self.rankdom.append(t) return sorted(self.rankdom, key=lambda d: d[0]) - - """exclude domains from a regular expression. If validdomain was called, it's only on the valid domain list.""" - def exclude(self,expression=None): + def exclude(self, expression=None): self.cleandomain = [] excludefilter = re.compile(expression) @@ -218,7 +248,7 @@ class Extract: """include domains from a regular expression. If validdomain was called, it's only on the valid domain list.""" - def include(self,expression=None): + def include(self, expression=None): self.cleandomain = [] includefilter = re.compile(expression) @@ -234,4 +264,24 @@ class Extract: return self.cleandomain +if __name__ == "__main__": + c = Extract(rawtext="www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist", nameservers=['127.0.0.1']) + + print c.potentialdomain() + print c.potentialdomain(validTLD=True) + print c.validdomain(extended=True) + print "US:" + print c.localizedomain(cc='US') + print "LU:" + print c.localizedomain(cc='LU') + print "BE:" + print c.localizedomain(cc='BE') + print "Ranking:" + print c.rankdomain() + print "List of ip addresses:" + print c.ipaddress(extended=True) + print "Include dot.lu:" + print c.include(expression=r'\.lu$') + print "Exclude dot.lu:" + print c.exclude(expression=r'\.lu$')