Mirror of https://github.com/adulau/DomainClassifier.git (synced 2024-11-22 18:07:07 +00:00)

Commit bcf5f88a48: Merge branch 'master' of github.com:adulau/DomainClassifier

4 changed files with 178 additions and 65 deletions
@@ -1 +1 @@
-__version__ = "1.0"
+__version__ = "1.1"
@@ -7,25 +7,31 @@ attributes.
 import re
 import dns.resolver
 import IPy
+import redis
 import socket
 import time
 from datetime import date, timedelta
+import os
+import sys
+from uuid import uuid4
+
+from multiprocessing import Process as Proc
+
 try:
-    #python 3
+    # python 3
     import urllib.request as urllib
 except:
-    #python 2
+    # python 2
     import urllib2 as urllib

 try:
     from pybgpranking import BGPRanking
 except:
-    print ("pybgpranking is not installed - ranking of ASN values won't be possible")
+    print("pybgpranking is not installed - ranking of ASN values won't be possible")

 __author__ = "Alexandre Dulaunoy"
-__copyright__ = "Copyright 2012-2021, Alexandre Dulaunoy"
+__copyright__ = "Copyright 2012-2024, Alexandre Dulaunoy"
 __license__ = "AGPL version 3"
-__version__ = "0.9"
+__version__ = "1.1"


 class Extract:
@@ -34,15 +40,26 @@ class Extract:
     from a rawtext stream. When call, the rawtext parameter is a string
     containing the raw data to be process."""

-    def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port= 53):
+    def __init__(self, rawtext=None, nameservers=['8.8.8.8'], port=53, redis_host='', redis_port=6379, redis_db=0, expire_time=3600, re_timeout=-1):
         self.rawtext = rawtext
         self.presolver = dns.resolver.Resolver()
         self.presolver.nameservers = nameservers
-        self.presolver.port = 53
+        self.presolver.port = port
         self.presolver.lifetime = 1.0
         self.bgprankingserver = 'pdns.circl.lu'
         self.vdomain = []
         self.listtld = []

+        self.re_domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b')
+
+        if redis_host and redis_port:
+            self.redis = redis.StrictRedis(host=redis_host, port=redis_port, db=redis_db, decode_responses=True)
+            self.uuid = str(uuid4())
+            self.re_timeout = re_timeout
+        else:
+            self.redis = None
+        self.expire_time = expire_time
+
         self.domain = self.potentialdomain()

     """__origin is a private function to the ASN lookup for an IP address via
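The new constructor keeps the previous defaults, so existing callers are unchanged; the Redis-backed cache and the regex timeout only become active when a Redis host is given. A minimal usage sketch (not part of the commit): the parameter names come from the diff above, while the Redis endpoint and the import path are assumptions.

# Sketch only: Redis endpoint and import path are assumed, not part of this commit.
from DomainClassifier.domainclassifier import Extract

# Without Redis: behaves like the previous release (no caching, no regex timeout).
plain = Extract(rawtext="www.circl.lu some text www.foo.be")

# With Redis: enables result caching and the multiprocessing-based regex timeout.
cached = Extract(
    rawtext="www.circl.lu some text www.foo.be",
    redis_host='127.0.0.1',  # assumed local Redis instance
    redis_port=6379,
    redis_db=0,
    expire_time=3600,        # TTL in seconds for cached validdomain() results
    re_timeout=10,           # abort the extraction regex after 10 seconds
)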
@@ -52,7 +69,11 @@ class Extract:
     def __origin(self, ipaddr=None):

         if ipaddr:
-            clook = IPy.IP(str(ipaddr)).reverseName().replace('.in-addr.arpa.', '.origin.asn.cymru.com')
+            clook = (
+                IPy.IP(str(ipaddr))
+                .reverseName()
+                .replace('.in-addr.arpa.', '.origin.asn.cymru.com')
+            )
             try:
                 a = self.presolver.query(clook, 'TXT')
             except dns.resolver.NXDOMAIN:
@@ -62,23 +83,44 @@ class Extract:
             if a:
                 x = str(a[0]).split("|")
                 # why so many spaces?
-                x = list( map(lambda t: t.replace("\"", "").strip(), x) )
+                x = list(map(lambda t: t.replace("\"", "").strip(), x))
                 return (x[0], x[2], a[0])
             else:
                 return None

     """__bgpanking return the ranking the float value of an ASN.
     """

     def __bgpranking(self, asn=None):
         if asn:
             bgpranking = BGPRanking()
-            value = bgpranking.query(asn, date=(date.today() - timedelta(1)).isoformat())
+            value = bgpranking.query(
+                asn, date=(date.today() - timedelta(1)).isoformat()
+            )
             return value['response']['ranking']['rank']

-    def __updatelisttld(self):
+    def __updatelisttld(self, force=False):
         ianatldlist = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
-        req = urllib.Request(ianatldlist)
-        req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0')
-        tlds = ( urllib.urlopen(req).read() ).decode('utf8')
+        userdir = os.path.expanduser("~")
+        cachedir = os.path.join(userdir, ".DomainClassifier")
+        if not os.path.exists(cachedir):
+            os.mkdir(cachedir)
+        tldcache = os.path.join(cachedir, "tlds")
+        if not os.path.exists(tldcache):
+            print(tldcache)
+            req = urllib.Request(ianatldlist)
+            req.add_header(
+                'User-Agent',
+                'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0',
+            )
+            tlds = (urllib.urlopen(req).read()).decode('utf8')
+            f = open(tldcache, "wb")
+            f.write(tlds.encode("utf-8"))
+            f.close()
+
+        f = open(tldcache, "r")
+        tlds = f.read()
+        f.close()
         tlds = tlds.split("\n")
         for tld in tlds:
             self.listtld.append(tld.lower())
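With this change the IANA TLD list is downloaded once and then reused from a per-user cache file instead of being fetched on every run. A small sketch (not part of the diff) of where the cache lives and how to clear it so the next run re-downloads the list; the path is derived from the code above. The new force parameter does not appear in the body shown here, so deleting the cache file is the visible way to refresh the list.

# Sketch: the cache location mirrors what __updatelisttld() builds above.
import os

tldcache = os.path.join(os.path.expanduser("~"), ".DomainClassifier", "tlds")
if os.path.exists(tldcache):
    os.remove(tldcache)  # force a fresh download of tlds-alpha-by-domain.txt on next use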
@@ -97,23 +139,53 @@ class Extract:

         return self.cleandomain

-    def text(self, rawtext=False):
+    def __re_findall(self, rawtext):
+        for x in re.findall(self.re_domain, rawtext):
+            if x[0]:
+                self.redis.sadd('cache:regex:{}'.format(self.uuid), x[0])
+        self.redis.expire('cache:regex:{}'.format(self.uuid), 360)
+
+    def __regex_findall(self, rawtext, timeout):
+        proc = Proc(target=self.__re_findall, args=(rawtext,))
+        try:
+            proc.start()
+            proc.join(timeout)
+            if proc.is_alive():
+                proc.terminate()
+                print('regex: processing timeout')
+                return []
+            else:
+                domains = self.redis.smembers('cache:regex:{}'.format(self.uuid))
+                self.redis.delete('cache:regex:{}'.format(self.uuid))
+                proc.terminate()
+                return domains
+        except KeyboardInterrupt:
+            print("Caught KeyboardInterrupt, terminating workers")
+            proc.terminate()
+            sys.exit(0)
+
+    def text(self, rawtext=''):
         if rawtext:
             self.rawtext = rawtext
             self.domain = self.potentialdomain()
             self.vdomain = []
             return True
         return False

     """potentialdomain method extracts potential domains matching any
-    string that is a serie of string with maximun 63 character separated by a
+    string that is a serie of string with maximum 63 character separated by a
     dot. The method used the rawtext defined at the instantiation of the class.
     This return a list of a potential domain."""

     def potentialdomain(self, validTLD=True):
         self.domain = []
-        domain = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b')
-        for x in domain.findall(self.rawtext):
-            if x[0]:
-                self.domain.append(x[0])
+        if self.re_timeout > 0 and self.redis:
+            self.domain = list(self.__regex_findall(self.rawtext, self.re_timeout))
+        else:
+            domains = self.re_domain.findall(self.rawtext)
+            for x in domains:
+                if x[0]:
+                    self.domain.append(x[0])
         if validTLD:
             self.domain = self.__listtld()
         return self.domain
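When both a Redis connection and a positive re_timeout are configured, the extraction regex now runs in a child process: matches are pushed into a short-lived Redis set keyed by the instance UUID, and the parent terminates the worker if it exceeds the timeout, which protects against inputs that make the regex crawl. A usage sketch (not part of the commit): the Redis endpoint, the import path, and the oversized input are assumptions.

# Sketch: exercising the timeout-protected extraction path added above.
from DomainClassifier.domainclassifier import Extract  # import path assumed

c = Extract(
    rawtext="",
    redis_host='127.0.0.1',  # assumed local Redis; required for the timeout path
    redis_port=6379,
    re_timeout=5,            # give the regex worker at most 5 seconds
)
c.text(rawtext="www.circl.lu " + "a" * 100000)  # oversized input, purely illustrative
print(c.potentialdomain())  # empty if the worker was terminated on timeout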
@@ -124,31 +196,66 @@ class Extract:
     returns a list of existing domain. If the extended flag is true, a set is
     return with the associated DNS resources found."""

-    def validdomain(self, rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'], extended=True, passive_dns=False):
+    def validdomain(
+        self,
+        rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'],
+        extended=True,
+        passive_dns=False,
+    ):
         if extended is False:
             self.vdomain = set()
         else:
             self.vdomain = []

         for domain in self.domain:
-            for dnstype in rtype:
-                try:
-                    answers = self.presolver.query(domain, dnstype)
-                except:
-                    pass
-                else:
-                    # Pasive DNS output
-                    # timestamp||dns-client ||dns-server||RR class||Query||Query Type||Answer||TTL||Count
-                    if passive_dns:
-                        rrset = answers.rrset.to_text().splitlines()
-                        for dns_resp in rrset:
-                            dns_resp = dns_resp.split()
-                            passive_dns_out = '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format(time.time(), self.presolver.nameservers[0], dns_resp[2], domain, dnstype, dns_resp[4], answers.ttl)
-                            self.vdomain.add((passive_dns_out))
-                    elif extended:
-                        self.vdomain.append((domain, dnstype, answers[0]))
-                    else:
-                        self.vdomain.add((domain))
+            if self.redis:
+                if self.redis.exists('dom_class:cache:{}'.format(domain)):
+                    passive_dns_out = self.redis.smembers('dom_class:cache:{}'.format(domain))
+                    for out in passive_dns_out:
+                        if extended:
+                            out = tuple(out.split('[^]', 2))
+                            self.vdomain.append(out)
+                        else:
+                            self.vdomain.add(out)
+            else:
+
+                for dnstype in rtype:
+                    try:
+                        answers = self.presolver.query(domain, dnstype)
+                    except:
+                        pass
+                    else:
+                        # Passive DNS output
+                        # timestamp||dns-client ||dns-server||RR class||Query||Query Type||Answer||TTL||Count
+                        if passive_dns:
+                            rrset = answers.rrset.to_text().splitlines()
+                            for dns_resp in rrset:
+                                dns_resp = dns_resp.split()
+                                passive_dns_out = (
+                                    '{}||127.0.0.1||{}||{}||{}||{}||{}||{}||1\n'.format(
+                                        time.time(),
+                                        self.presolver.nameservers[0],
+                                        dns_resp[2],
+                                        domain,
+                                        dnstype,
+                                        dns_resp[4],
+                                        answers.ttl,
+                                    )
+                                )
+                                self.vdomain.add((passive_dns_out))
+                                if self.redis:
+                                    self.redis.sadd('dom_class:cache:{}'.format(domain), passive_dns_out)
+                                    self.redis.expire('dom_class:cache:{}'.format(domain), self.expire_time)
+                        elif extended:
+                            self.vdomain.append((domain, dnstype, answers[0]))
+                            if self.redis:
+                                self.redis.sadd('dom_class:cache:{}'.format(domain), '{}[^]{}[^]{}'.format(domain, dnstype, answers[0]))
+                                self.redis.expire('dom_class:cache:{}'.format(domain), self.expire_time)
+                        else:
+                            self.vdomain.add((domain))
+                            if self.redis:
+                                self.redis.sadd('dom_class:cache:{}'.format(domain), domain)
+                                self.redis.expire('dom_class:cache:{}'.format(domain), self.expire_time)
         return self.vdomain

     """ipaddress method extracts from the domain list the valid IPv4 addresses"""
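Per the diff, cached answers are stored as members of a Redis set named dom_class:cache:&lt;domain&gt;, either as raw passive-DNS lines or as domain[^]rrtype[^]answer triplets, and they expire after expire_time seconds. A sketch of inspecting such a cache entry directly (not part of the commit): the key layout is taken from the diff, while the Redis endpoint and the example domain are assumptions.

# Sketch: peek at a DomainClassifier cache entry in Redis.
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0, decode_responses=True)
key = 'dom_class:cache:{}'.format('www.circl.lu')  # example domain, assumed
for member in r.smembers(key):
    # extended entries look like 'domain[^]rrtype[^]answer'
    print(member.split('[^]', 2))
print(r.ttl(key))  # seconds until the entry expires (expire_time in the diff)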
@@ -188,7 +295,7 @@
                     orig = self.__origin(ipaddr=dom[2])[1]
                 except:
                     continue
-                if(orig == cc):
+                if orig == cc:
                     self.localdom.append(dom)
             elif dom[1] == 'CNAME':
                 cname = str(dom[2])
@@ -197,7 +304,7 @@
                     orig = self.__origin(ipaddr=ip)[1]
                 except:
                     continue
-                if(orig == cc):
+                if orig == cc:
                     self.localdom.append(dom)
         return self.localdom

@@ -276,32 +383,37 @@
             if type(dom) == tuple:
                 dom = dom[0]
             if includefilter.search(dom):
                 self.cleandomain.append(dom)

         return set(self.cleandomain)


 if __name__ == "__main__":
-    c = Extract(rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist")
-    c.text(rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be")
-    print (c.potentialdomain())
-    print (c.potentialdomain(validTLD=True))
-    print (c.validdomain(extended=True))
-    print ("US:")
-    print (c.localizedomain(cc='US'))
-    print ("LU:")
-    print (c.localizedomain(cc='LU'))
-    print ("BE:")
-    print (c.localizedomain(cc='BE'))
-    print ("Ranking:")
-    print (c.rankdomain())
-    print ("List of ip addresses:")
-    print (c.ipaddress(extended=False))
-    print ("Include dot.lu:")
-    print (c.include(expression=r'\.lu$'))
-    print ("Exclude dot.lu:")
-    print (c.exclude(expression=r'\.lu$'))
+    c = Extract(
+        rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist"
+    )
+    c.text(
+        rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be"
+    )
+    print(c.potentialdomain())
+    print(c.potentialdomain(validTLD=True))
+    print(c.validdomain(extended=True))
+    print("US:")
+    print(c.localizedomain(cc='US'))
+    print("LU:")
+    print(c.localizedomain(cc='LU'))
+    print("BE:")
+    print(c.localizedomain(cc='BE'))
+    print("Ranking:")
+    print(c.rankdomain())
+    print("List of ip addresses:")
+    print(c.ipaddress(extended=False))
+    print("Include dot.lu:")
+    print(c.include(expression=r'\.lu$'))
+    print("Exclude dot.lu:")
+    print(c.exclude(expression=r'\.lu$'))
     c.text(rawtext="www.lwn.net www.undeadly.org")
-    print (c.potentialdomain(validTLD=True))
+    print(c.potentialdomain(validTLD=True))
     c.validdomain()
-    print (c.localizedomain(cc='US'))
+    print(c.localizedomain(cc='US'))
     print(c.validdomain(extended=False, passive_dns=True))
@@ -1,3 +1,4 @@
 IPy
 dnspython
 git+https://github.com/D4-project/BGP-Ranking.git/#egg=pybgpranking&subdirectory=client
+redis

setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 setup(
     name="DomainClassifier",
-    version="1.0",
+    version="1.1",
     packages=find_packages(),
     install_requires=['dnspython', 'IPy', 'pybgpranking'],
     dependency_links=[