mirror of
https://github.com/adulau/DomainClassifier.git
synced 2024-11-22 09:57:07 +00:00
chg: [domainclassifier] clean-up code
This commit is contained in:
parent
115c48f65c
commit
8debd6c6b7
1 changed files with 69 additions and 36 deletions
|
@ -12,16 +12,16 @@ import time
|
|||
from datetime import date, timedelta
|
||||
|
||||
try:
|
||||
#python 3
|
||||
# python 3
|
||||
import urllib.request as urllib
|
||||
except:
|
||||
#python 2
|
||||
# python 2
|
||||
import urllib2 as urllib
|
||||
|
||||
try:
|
||||
from pybgpranking import BGPRanking
|
||||
except:
|
||||
print ("pybgpranking is not installed - ranking of ASN values won't be possible")
|
||||
print("pybgpranking is not installed - ranking of ASN values won't be possible")
|
||||
__author__ = "Alexandre Dulaunoy"
|
||||
__copyright__ = "Copyright 2012-2021, Alexandre Dulaunoy"
|
||||
__license__ = "AGPL version 3"
|
||||
|
@ -34,7 +34,7 @@ class Extract:
|
|||
from a rawtext stream. When called, the rawtext parameter is a string
|
||||
containing the raw data to be processed."""
|
||||
|
||||
def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port= 53):
|
||||
def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port=53):
|
||||
self.rawtext = rawtext
|
||||
self.presolver = dns.resolver.Resolver()
|
||||
self.presolver.nameservers = nameservers
|
||||
|
@ -52,7 +52,11 @@ class Extract:
|
|||
def __origin(self, ipaddr=None):
|
||||
|
||||
if ipaddr:
|
||||
clook = IPy.IP(str(ipaddr)).reverseName().replace('.in-addr.arpa.', '.origin.asn.cymru.com')
|
||||
clook = (
|
||||
IPy.IP(str(ipaddr))
|
||||
.reverseName()
|
||||
.replace('.in-addr.arpa.', '.origin.asn.cymru.com')
|
||||
)
|
||||
try:
|
||||
a = self.presolver.query(clook, 'TXT')
|
||||
except dns.resolver.NXDOMAIN:
|
||||
|
@ -62,23 +66,30 @@ class Extract:
|
|||
if a:
|
||||
x = str(a[0]).split("|")
|
||||
# why so many spaces?
|
||||
x = list( map(lambda t: t.replace("\"", "").strip(), x) )
|
||||
x = list(map(lambda t: t.replace("\"", "").strip(), x))
|
||||
return (x[0], x[2], a[0])
|
||||
else:
|
||||
return None
|
||||
|
||||
"""__bgpranking returns the ranking (a float value) of an ASN.
|
||||
"""
|
||||
|
||||
def __bgpranking(self, asn=None):
|
||||
if asn:
|
||||
bgpranking = BGPRanking()
|
||||
value = bgpranking.query(asn, date=(date.today() - timedelta(1)).isoformat())
|
||||
value = bgpranking.query(
|
||||
asn, date=(date.today() - timedelta(1)).isoformat()
|
||||
)
|
||||
return value['response']['ranking']['rank']
|
||||
|
||||
def __updatelisttld(self):
|
||||
ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
|
||||
req = urllib.Request(ianatldlist)
|
||||
req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0')
|
||||
tlds = ( urllib.urlopen(req).read() ).decode('utf8')
|
||||
req.add_header(
|
||||
'User-Agent',
|
||||
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0',
|
||||
)
|
||||
tlds = (urllib.urlopen(req).read()).decode('utf8')
|
||||
tlds = tlds.split("\n")
|
||||
for tld in tlds:
|
||||
self.listtld.append(tld.lower())
|
||||
|
@ -104,10 +115,12 @@ class Extract:
|
|||
self.vdomain = []
|
||||
return True
|
||||
return False
|
||||
|
||||
"""potentialdomain method extracts potential domains matching any
|
||||
string that is a series of strings of at most 63 characters each, separated by a
|
||||
dot. The method uses the rawtext defined at the instantiation of the class.
|
||||
This returns a list of potential domains."""
|
||||
|
||||
def potentialdomain(self, validTLD=True):
|
||||
self.domain = []
|
||||
domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b')
|
||||
|
@ -124,7 +137,12 @@ class Extract:
|
|||
returns a list of existing domains. If the extended flag is true, a set is
|
||||
returned with the associated DNS resources found."""
|
||||
|
||||
def validdomain(self, rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'], extended=True, passive_dns=False):
|
||||
def validdomain(
|
||||
self,
|
||||
rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'],
|
||||
extended=True,
|
||||
passive_dns=False,
|
||||
):
|
||||
if extended is False:
|
||||
self.vdomain = set()
|
||||
else:
|
||||
|
@ -143,7 +161,17 @@ class Extract:
|
|||
rrset = answers.rrset.to_text().splitlines()
|
||||
for dns_resp in rrset:
|
||||
dns_resp = dns_resp.split()
|
||||
passive_dns_out = '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format(time.time(), self.presolver.nameservers[0], dns_resp[2], domain, dnstype, dns_resp[4], answers.ttl)
|
||||
passive_dns_out = (
|
||||
'{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format(
|
||||
time.time(),
|
||||
self.presolver.nameservers[0],
|
||||
dns_resp[2],
|
||||
domain,
|
||||
dnstype,
|
||||
dns_resp[4],
|
||||
answers.ttl,
|
||||
)
|
||||
)
|
||||
self.vdomain.add((passive_dns_out))
|
||||
elif extended:
|
||||
self.vdomain.append((domain, dnstype, answers[0]))
|
||||
|
@ -188,7 +216,7 @@ class Extract:
|
|||
orig = self.__origin(ipaddr=dom[2])[1]
|
||||
except:
|
||||
continue
|
||||
if(orig == cc):
|
||||
if orig == cc:
|
||||
self.localdom.append(dom)
|
||||
elif dom[1] == 'CNAME':
|
||||
cname = str(dom[2])
|
||||
|
@ -197,7 +225,7 @@ class Extract:
|
|||
orig = self.__origin(ipaddr=ip)[1]
|
||||
except:
|
||||
continue
|
||||
if(orig == cc):
|
||||
if orig == cc:
|
||||
self.localdom.append(dom)
|
||||
return self.localdom
|
||||
|
||||
|
@ -280,28 +308,33 @@ class Extract:
|
|||
|
||||
return set(self.cleandomain)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
c = Extract(rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist")
|
||||
c.text(rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be")
|
||||
print (c.potentialdomain())
|
||||
print (c.potentialdomain(validTLD=True))
|
||||
print (c.validdomain(extended=True))
|
||||
print ("US:")
|
||||
print (c.localizedomain(cc='US'))
|
||||
print ("LU:")
|
||||
print (c.localizedomain(cc='LU'))
|
||||
print ("BE:")
|
||||
print (c.localizedomain(cc='BE'))
|
||||
print ("Ranking:")
|
||||
print (c.rankdomain())
|
||||
print ("List of ip addresses:")
|
||||
print (c.ipaddress(extended=False))
|
||||
print ("Include dot.lu:")
|
||||
print (c.include(expression=r'\.lu$'))
|
||||
print ("Exclude dot.lu:")
|
||||
print (c.exclude(expression=r'\.lu$'))
|
||||
c = Extract(
|
||||
rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist"
|
||||
)
|
||||
c.text(
|
||||
rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be"
|
||||
)
|
||||
print(c.potentialdomain())
|
||||
print(c.potentialdomain(validTLD=True))
|
||||
print(c.validdomain(extended=True))
|
||||
print("US:")
|
||||
print(c.localizedomain(cc='US'))
|
||||
print("LU:")
|
||||
print(c.localizedomain(cc='LU'))
|
||||
print("BE:")
|
||||
print(c.localizedomain(cc='BE'))
|
||||
print("Ranking:")
|
||||
print(c.rankdomain())
|
||||
print("List of ip addresses:")
|
||||
print(c.ipaddress(extended=False))
|
||||
print("Include dot.lu:")
|
||||
print(c.include(expression=r'\.lu$'))
|
||||
print("Exclude dot.lu:")
|
||||
print(c.exclude(expression=r'\.lu$'))
|
||||
c.text(rawtext="www.lwn.net www.undeadly.org")
|
||||
print (c.potentialdomain(validTLD=True))
|
||||
print(c.potentialdomain(validTLD=True))
|
||||
c.validdomain()
|
||||
print (c.localizedomain(cc='US'))
|
||||
print(c.localizedomain(cc='US'))
|
||||
print(c.validdomain(extended=False, passive_dns=True))
|
||||
|
|
Loading…
Reference in a new issue