mirror of
https://github.com/adulau/DomainClassifier.git
synced 2024-11-25 19:37:07 +00:00
ValidTLD option added
A new option, validTLD, has been introduced in DomainClassifier and is enabled by default. With it, the extraction of potential domains is filtered against the list of TLDs assigned by IANA, so only names ending in an assigned TLD are kept. If you are extracting data for non-assigned or internal TLDs, you can disable the option by passing validTLD=False to the potentialdomain function. The list of assigned TLDs is downloaded from IANA.
This commit is contained in:
parent
83fafea305
commit
90fc808eec
1 changed files with 86 additions and 36 deletions
|
@ -8,12 +8,12 @@ import re
|
||||||
import dns.resolver
|
import dns.resolver
|
||||||
import IPy
|
import IPy
|
||||||
import socket
|
import socket
|
||||||
|
import urllib2
|
||||||
|
|
||||||
__author__ = "Alexandre Dulaunoy"
|
__author__ = "Alexandre Dulaunoy"
|
||||||
__copyright__ = "Copyright 2012-2014, Alexandre Dulaunoy"
|
__copyright__ = "Copyright 2012-2014, Alexandre Dulaunoy"
|
||||||
__license__ = "AGPL version 3"
|
__license__ = "AGPL version 3"
|
||||||
__version__ = "0.2"
|
__version__ = "0.3"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Extract:
|
class Extract:
|
||||||
|
@ -22,7 +22,6 @@ class Extract:
|
||||||
from a rawtext stream. When call, the rawtext parameter is a string
|
from a rawtext stream. When call, the rawtext parameter is a string
|
||||||
containing the raw data to be process."""
|
containing the raw data to be process."""
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, rawtext=None, nameservers=['8.8.8.8']):
|
def __init__(self, rawtext=None, nameservers=['8.8.8.8']):
|
||||||
self.rawtext = rawtext
|
self.rawtext = rawtext
|
||||||
self.presolver = dns.resolver.Resolver()
|
self.presolver = dns.resolver.Resolver()
|
||||||
|
@ -30,6 +29,7 @@ class Extract:
|
||||||
self.presolver.lifetime = 1.0
|
self.presolver.lifetime = 1.0
|
||||||
self.bgprankingserver = 'pdns.circl.lu'
|
self.bgprankingserver = 'pdns.circl.lu'
|
||||||
self.vdomain = []
|
self.vdomain = []
|
||||||
|
self.listtld = []
|
||||||
self.domain = self.potentialdomain()
|
self.domain = self.potentialdomain()
|
||||||
|
|
||||||
"""__origin is a private function to the ASN lookup for an IP address via
|
"""__origin is a private function to the ASN lookup for an IP address via
|
||||||
|
@ -40,9 +40,12 @@ class Extract:
|
||||||
|
|
||||||
if ipaddr:
|
if ipaddr:
|
||||||
clook = IPy.IP(str(ipaddr)).reverseName().replace('.in-addr.arpa.', '.origin.asn.cymru.com')
|
clook = IPy.IP(str(ipaddr)).reverseName().replace('.in-addr.arpa.', '.origin.asn.cymru.com')
|
||||||
try: a = self.presolver.query(clook, 'TXT')
|
try:
|
||||||
except dns.resolver.NXDOMAIN: return None
|
a = self.presolver.query(clook, 'TXT')
|
||||||
except dns.exception.Timeout: return None
|
except dns.resolver.NXDOMAIN:
|
||||||
|
return None
|
||||||
|
except dns.exception.Timeout:
|
||||||
|
return None
|
||||||
if a:
|
if a:
|
||||||
x = str(a[0]).split("|")
|
x = str(a[0]).split("|")
|
||||||
# why so many spaces?
|
# why so many spaces?
|
||||||
|
@ -65,8 +68,10 @@ class Extract:
|
||||||
break
|
break
|
||||||
s.close()
|
s.close()
|
||||||
if len(r) > 0:
|
if len(r) > 0:
|
||||||
try: rr = r.split("\n")[1].split(",")
|
try:
|
||||||
except IndexError: return None
|
rr = r.split("\n")[1].split(",")
|
||||||
|
except IndexError:
|
||||||
|
return None
|
||||||
if len(rr) > 1:
|
if len(rr) > 1:
|
||||||
rank = rr[1]
|
rank = rr[1]
|
||||||
return float(rank)
|
return float(rank)
|
||||||
|
@ -75,18 +80,39 @@ class Extract:
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def __updatelisttld(self):
    """Download the IANA list of assigned TLDs and add it to self.listtld.

    Every entry is stored lower-cased. Blank lines and lines starting
    with "#" are skipped so that only real TLD labels end up in the list.
    Errors raised by urllib2.urlopen are not caught here.
    """
    ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
    # Plain GET: a second positional argument to urlopen() is the request
    # body and would turn the call into a POST.
    response = urllib2.urlopen(ianatldlist)
    try:
        tlds = response.read()
    finally:
        response.close()
    for line in tlds.split("\n"):
        tld = line.strip().lower()
        # NOTE(review): the "#" skip assumes the IANA file carries a
        # comment/version header line -- confirm against the live file.
        if tld and not tld.startswith("#"):
            self.listtld.append(tld)
|
||||||
|
|
||||||
|
def __listtld(self):
    """Keep only the entries of self.domain whose last label is a known TLD.

    The TLD list is fetched through __updatelisttld() the first time it is
    needed (when self.listtld is still empty). The filtered list is stored
    in self.cleandomain and returned.

    Returns the list of domains ending in a known TLD, or False when
    self.domain is None.
    """
    if not self.listtld:
        self.__updatelisttld()
    self.cleandomain = []
    if self.domain is None:
        return False
    # One set lookup per domain instead of scanning the whole TLD list.
    known = set(tld.lower() for tld in self.listtld)
    for domain in self.domain:
        # TLDs are compared lower-cased so that "EXAMPLE.COM" is kept too.
        lastpart = domain.rsplit(".", 1)[-1].lower()
        if lastpart in known:
            self.cleandomain.append(domain)
    return self.cleandomain
|
||||||
|
|
||||||
"""potentialdomain method extracts potential domains matching any
|
"""potentialdomain method extracts potential domains matching any
|
||||||
string that is a serie of string with maximun 63 character separated by a
|
string that is a serie of string with maximun 63 character separated by a
|
||||||
dot. The method used the rawtext defined at the instantiation of the class.
|
dot. The method used the rawtext defined at the instantiation of the class.
|
||||||
This return a list of a potential domain."""
|
This return a list of a potential domain."""
|
||||||
|
def potentialdomain(self, validTLD=True):
    """Extract potential domain names from self.rawtext.

    A candidate is any run of dot-separated labels of at most 63
    characters each (letters, digits and "-"). The result is stored in
    self.domain and returned.

    validTLD -- when true (the default), the candidates are filtered
                through __listtld() so that only names ending in a known
                TLD are kept; pass False to keep every candidate.
    """
    self.domain = []
    # rawtext defaults to None in the constructor; with no text there is
    # nothing to scan and no reason to load the TLD list.
    if not self.rawtext:
        return self.domain
    pattern = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b')
    # findall() yields (whole match, last ".label") tuples; keep the whole.
    self.domain = [found[0] for found in pattern.findall(self.rawtext)
                   if found[0]]
    if validTLD:
        self.domain = self.__listtld()
    return self.domain
|
||||||
|
|
||||||
"""validdomain method used the extracted domains from the domain method to
|
"""validdomain method used the extracted domains from the domain method to
|
||||||
|
@ -151,7 +177,8 @@ class Extract:
|
||||||
orig = self.__origin(ipaddr=dom[2])[1]
|
orig = self.__origin(ipaddr=dom[2])[1]
|
||||||
except:
|
except:
|
||||||
continue
|
continue
|
||||||
if(orig == cc): self.localdom.append(dom)
|
if(orig == cc):
|
||||||
|
self.localdom.append(dom)
|
||||||
elif dom[1] == 'CNAME':
|
elif dom[1] == 'CNAME':
|
||||||
cname = str(dom[2])
|
cname = str(dom[2])
|
||||||
ip = socket.gethostbyname(cname)
|
ip = socket.gethostbyname(cname)
|
||||||
|
@ -159,7 +186,8 @@ class Extract:
|
||||||
orig = self.__origin(ipaddr=ip)[1]
|
orig = self.__origin(ipaddr=ip)[1]
|
||||||
except:
|
except:
|
||||||
continue
|
continue
|
||||||
if(orig == cc): self.localdom.append(dom)
|
if(orig == cc):
|
||||||
|
self.localdom.append(dom)
|
||||||
return self.localdom
|
return self.localdom
|
||||||
|
|
||||||
"""rankdomain method use the validdomain list (in extended format to rank
|
"""rankdomain method use the validdomain list (in extended format to rank
|
||||||
|
@ -184,17 +212,19 @@ class Extract:
|
||||||
self.rankdom.append(t)
|
self.rankdom.append(t)
|
||||||
elif dom[1] == 'CNAME':
|
elif dom[1] == 'CNAME':
|
||||||
cname = str(dom[2])
|
cname = str(dom[2])
|
||||||
try: ip = socket.gethostbyname(cname)
|
try:
|
||||||
except: continue
|
ip = socket.gethostbyname(cname)
|
||||||
try: asn = self.__origin(ipaddr=ip)[0]
|
except:
|
||||||
except TypeError: continue
|
continue
|
||||||
|
try:
|
||||||
|
asn = self.__origin(ipaddr=ip)[0]
|
||||||
|
except TypeError:
|
||||||
|
continue
|
||||||
rank = self.__bgpranking(asn)
|
rank = self.__bgpranking(asn)
|
||||||
t = (rank, dom[0])
|
t = (rank, dom[0])
|
||||||
self.rankdom.append(t)
|
self.rankdom.append(t)
|
||||||
return sorted(self.rankdom, key=lambda d: d[0])
|
return sorted(self.rankdom, key=lambda d: d[0])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"""exclude domains from a regular expression. If validdomain was called,
|
"""exclude domains from a regular expression. If validdomain was called,
|
||||||
it's only on the valid domain list."""
|
it's only on the valid domain list."""
|
||||||
|
|
||||||
|
@ -234,4 +264,24 @@ class Extract:
|
||||||
|
|
||||||
return self.cleandomain
|
return self.cleandomain
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Demo run: build an Extract over a sample text and print the result
    # of each public method in turn. Single-argument parenthesised prints
    # produce the same output as the bare print statement.
    sample = "www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist"
    extractor = Extract(rawtext=sample, nameservers=['127.0.0.1'])

    print(extractor.potentialdomain())
    print(extractor.potentialdomain(validTLD=True))
    print(extractor.validdomain(extended=True))

    # Same three country-code lookups, each preceded by its heading.
    for heading, country in (("US:", 'US'), ("LU:", 'LU'), ("BE:", 'BE')):
        print(heading)
        print(extractor.localizedomain(cc=country))

    print("Ranking:")
    print(extractor.rankdomain())
    print("List of ip addresses:")
    print(extractor.ipaddress(extended=True))
    print("Include dot.lu:")
    print(extractor.include(expression=r'\.lu$'))
    print("Exclude dot.lu:")
    print(extractor.exclude(expression=r'\.lu$'))
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue