ValidTLD option added

A new option, validTLD (enabled by default), has been introduced in
DomainClassifier. When it is enabled, the extraction of potential
domains is filtered so that only domains ending in a TLD from the
IANA assigned list are kept.

If you are extracting data that uses non-assigned/internal TLDs, you can
disable this default by passing validTLD=False to the potentialdomain method.

The list of assigned TLDs is downloaded from IANA.
This commit is contained in:
Alexandre Dulaunoy 2014-08-31 14:46:13 +02:00
parent 83fafea305
commit 90fc808eec

View file

@ -8,12 +8,12 @@ import re
import dns.resolver import dns.resolver
import IPy import IPy
import socket import socket
import urllib2
__author__ = "Alexandre Dulaunoy" __author__ = "Alexandre Dulaunoy"
__copyright__ = "Copyright 2012-2014, Alexandre Dulaunoy" __copyright__ = "Copyright 2012-2014, Alexandre Dulaunoy"
__license__ = "AGPL version 3" __license__ = "AGPL version 3"
__version__ = "0.2" __version__ = "0.3"
class Extract: class Extract:
@ -22,7 +22,6 @@ class Extract:
from a rawtext stream. When call, the rawtext parameter is a string from a rawtext stream. When call, the rawtext parameter is a string
containing the raw data to be process.""" containing the raw data to be process."""
def __init__(self, rawtext=None, nameservers=['8.8.8.8']): def __init__(self, rawtext=None, nameservers=['8.8.8.8']):
self.rawtext = rawtext self.rawtext = rawtext
self.presolver = dns.resolver.Resolver() self.presolver = dns.resolver.Resolver()
@ -30,6 +29,7 @@ class Extract:
self.presolver.lifetime = 1.0 self.presolver.lifetime = 1.0
self.bgprankingserver = 'pdns.circl.lu' self.bgprankingserver = 'pdns.circl.lu'
self.vdomain = [] self.vdomain = []
self.listtld = []
self.domain = self.potentialdomain() self.domain = self.potentialdomain()
"""__origin is a private function to the ASN lookup for an IP address via """__origin is a private function to the ASN lookup for an IP address via
@ -39,15 +39,18 @@ class Extract:
def __origin(self, ipaddr=None): def __origin(self, ipaddr=None):
if ipaddr: if ipaddr:
clook = IPy.IP(str(ipaddr)).reverseName().replace('.in-addr.arpa.','.origin.asn.cymru.com') clook = IPy.IP(str(ipaddr)).reverseName().replace('.in-addr.arpa.', '.origin.asn.cymru.com')
try: a = self.presolver.query(clook, 'TXT') try:
except dns.resolver.NXDOMAIN: return None a = self.presolver.query(clook, 'TXT')
except dns.exception.Timeout: return None except dns.resolver.NXDOMAIN:
return None
except dns.exception.Timeout:
return None
if a: if a:
x = str(a[0]).split("|") x = str(a[0]).split("|")
# why so many spaces? # why so many spaces?
x = map (lambda t: t.replace("\"","").strip(), x) x = map(lambda t: t.replace("\"", "").strip(), x)
return (x[0],x[2],a[0]) return (x[0], x[2], a[0])
else: else:
return None return None
"""__bgpanking return the ranking the float value of an ASN. """__bgpanking return the ranking the float value of an ASN.
@ -55,7 +58,7 @@ class Extract:
def __bgpranking(self, asn=None): def __bgpranking(self, asn=None):
if asn: if asn:
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect((self.bgprankingserver,43)) s.connect((self.bgprankingserver, 43))
s.send(asn+"\r\n") s.send(asn+"\r\n")
r = '' r = ''
while True: while True:
@ -65,8 +68,10 @@ class Extract:
break break
s.close() s.close()
if len(r) > 0: if len(r) > 0:
try: rr = r.split("\n")[1].split(",") try:
except IndexError: return None rr = r.split("\n")[1].split(",")
except IndexError:
return None
if len(rr) > 1: if len(rr) > 1:
rank = rr[1] rank = rr[1]
return float(rank) return float(rank)
@ -75,18 +80,39 @@ class Extract:
else: else:
return None return None
def __updatelisttld(self):
    """Download the IANA TLD list and append its entries to self.listtld.

    Each entry is stripped of surrounding whitespace and lower-cased.
    Blank lines and lines starting with '#' are skipped so that only
    TLD labels end up in the list.

    Any error raised by urllib2 while fetching the list propagates to
    the caller.
    """
    ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
    # Only the URL is passed: urlopen's second positional argument is the
    # request body, and supplying one turns the request into a POST.
    response = urllib2.urlopen(ianatldlist)
    try:
        tlds = response.read()
    finally:
        response.close()
    for tld in tlds.split("\n"):
        tld = tld.strip().lower()
        # NOTE(review): the IANA file is expected to begin with a
        # '#'-prefixed header line and end with a newline; neither is a TLD.
        if tld and not tld.startswith("#"):
            self.listtld.append(tld)
def __listtld(self):
    """Keep only the entries of self.domain whose last label is a known TLD.

    The TLD list is loaded through __updatelisttld() the first time it
    is needed (that is, while self.listtld is still empty). The result
    is stored in self.cleandomain and returned.

    Returns:
        The filtered list of domains in their original order, or False
        when self.domain is None (kept for backward compatibility).
    """
    if not self.listtld:
        self.__updatelisttld()
    self.cleandomain = []
    if self.domain is None:
        return False
    # A set gives constant-time membership tests and ensures a domain is
    # kept once even if the TLD list contains duplicate entries.
    validtlds = set(self.listtld)
    for domain in self.domain:
        # The TLD list is stored lower-cased, so compare the last label
        # case-insensitively; an empty last label is never a TLD.
        lastpart = domain.rsplit(".", 1)[-1].lower()
        if lastpart and lastpart in validtlds:
            self.cleandomain.append(domain)
    return self.cleandomain
"""potentialdomain method extracts potential domains matching any """potentialdomain method extracts potential domains matching any
string that is a serie of string with maximun 63 character separated by a string that is a serie of string with maximun 63 character separated by a
dot. The method used the rawtext defined at the instantiation of the class. dot. The method used the rawtext defined at the instantiation of the class.
This return a list of a potential domain.""" This return a list of a potential domain."""
def potentialdomain(self, validTLD=True):
def potentialdomain(self):
self.domain = [] self.domain = []
domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b') domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b')
for x in domain.findall(self.rawtext): for x in domain.findall(self.rawtext):
if x[0]: if x[0]:
self.domain.append(x[0]) self.domain.append(x[0])
if validTLD:
self.domain = self.__listtld()
return self.domain return self.domain
"""validdomain method used the extracted domains from the domain method to """validdomain method used the extracted domains from the domain method to
@ -95,7 +121,7 @@ class Extract:
returns a list of existing domain. If the extended flag is true, a set is returns a list of existing domain. If the extended flag is true, a set is
return with the associated DNS resources found.""" return with the associated DNS resources found."""
def validdomain(self, rtype=['A','AAAA','SOA','MX','CNAME'], extended=True): def validdomain(self, rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'], extended=True):
if extended is False: if extended is False:
self.validdomain = set() self.validdomain = set()
else: else:
@ -112,7 +138,7 @@ class Extract:
if extended is False: if extended is False:
self.validdomain.add((domain)) self.validdomain.add((domain))
else: else:
self.validdomain.append((domain,dnstype,answers[0])) self.validdomain.append((domain, dnstype, answers[0]))
return self.validdomain return self.validdomain
"""ipaddress method extracts from the domain list the valid IPv4 addresses""" """ipaddress method extracts from the domain list the valid IPv4 addresses"""
@ -133,7 +159,7 @@ class Extract:
self.ipaddresses.append((d)) self.ipaddresses.append((d))
else: else:
orig = self.__origin(ipaddr=d) orig = self.__origin(ipaddr=d)
self.ipaddresses.add((d,str(orig))) self.ipaddresses.add((d, str(orig)))
return self.ipaddresses return self.ipaddresses
@ -151,7 +177,8 @@ class Extract:
orig = self.__origin(ipaddr=dom[2])[1] orig = self.__origin(ipaddr=dom[2])[1]
except: except:
continue continue
if(orig == cc): self.localdom.append(dom) if(orig == cc):
self.localdom.append(dom)
elif dom[1] == 'CNAME': elif dom[1] == 'CNAME':
cname = str(dom[2]) cname = str(dom[2])
ip = socket.gethostbyname(cname) ip = socket.gethostbyname(cname)
@ -159,7 +186,8 @@ class Extract:
orig = self.__origin(ipaddr=ip)[1] orig = self.__origin(ipaddr=ip)[1]
except: except:
continue continue
if(orig == cc): self.localdom.append(dom) if(orig == cc):
self.localdom.append(dom)
return self.localdom return self.localdom
"""rankdomain method use the validdomain list (in extended format to rank """rankdomain method use the validdomain list (in extended format to rank
@ -184,21 +212,23 @@ class Extract:
self.rankdom.append(t) self.rankdom.append(t)
elif dom[1] == 'CNAME': elif dom[1] == 'CNAME':
cname = str(dom[2]) cname = str(dom[2])
try: ip = socket.gethostbyname(cname) try:
except: continue ip = socket.gethostbyname(cname)
try: asn = self.__origin(ipaddr=ip)[0] except:
except TypeError: continue continue
try:
asn = self.__origin(ipaddr=ip)[0]
except TypeError:
continue
rank = self.__bgpranking(asn) rank = self.__bgpranking(asn)
t = (rank, dom[0]) t = (rank, dom[0])
self.rankdom.append(t) self.rankdom.append(t)
return sorted(self.rankdom, key=lambda d: d[0]) return sorted(self.rankdom, key=lambda d: d[0])
"""exclude domains from a regular expression. If validdomain was called, """exclude domains from a regular expression. If validdomain was called,
it's only on the valid domain list.""" it's only on the valid domain list."""
def exclude(self,expression=None): def exclude(self, expression=None):
self.cleandomain = [] self.cleandomain = []
excludefilter = re.compile(expression) excludefilter = re.compile(expression)
@ -218,7 +248,7 @@ class Extract:
"""include domains from a regular expression. If validdomain was called, """include domains from a regular expression. If validdomain was called,
it's only on the valid domain list.""" it's only on the valid domain list."""
def include(self,expression=None): def include(self, expression=None):
self.cleandomain = [] self.cleandomain = []
includefilter = re.compile(expression) includefilter = re.compile(expression)
@ -234,4 +264,24 @@ class Extract:
return self.cleandomain return self.cleandomain
if __name__ == "__main__":
    # Small demonstration of the Extract API on a hard-coded sample text.
    # NOTE: constructing Extract already calls potentialdomain() with its
    # default arguments, so the TLD list is needed from the start.
    # NOTE(review): nameservers=['127.0.0.1'] presumably requires a resolver
    # listening locally - confirm before running.
    c = Extract(rawtext="www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist", nameservers=['127.0.0.1'])
    # Show the raw candidates first, then the same extraction restricted to
    # valid TLDs, so the effect of the validTLD option is visible. The
    # filtered call comes last so the later steps work on the filtered list.
    print(c.potentialdomain(validTLD=False))
    print(c.potentialdomain(validTLD=True))
    print(c.validdomain(extended=True))
    print("US:")
    print(c.localizedomain(cc='US'))
    print("LU:")
    print(c.localizedomain(cc='LU'))
    print("BE:")
    print(c.localizedomain(cc='BE'))
    print("Ranking:")
    print(c.rankdomain())
    print("List of ip addresses:")
    print(c.ipaddress(extended=True))
    print("Include dot.lu:")
    print(c.include(expression=r'\.lu$'))
    print("Exclude dot.lu:")
    print(c.exclude(expression=r'\.lu$'))