mirror of
https://github.com/adulau/DomainClassifier.git
synced 2024-11-07 11:56:25 +00:00
First version of the domain extractor and classifier
The Extract class of the domainclassifier module has two methods: - domain(), to extract all potential domains from a raw text. The method returns a list. - validdomain(), returning all the existing domains based on their known DNS record sets such as A, AAAA or CNAME records. The method returns a set. If the extended option is requested, it returns a list of tuples, each containing a domain, the DNS record type that exists for it, and the returned data.
This commit is contained in:
commit
9eb1e3e4ef
3 changed files with 38 additions and 0 deletions
0
DomainClassifier/__init__.py
Normal file
0
DomainClassifier/__init__.py
Normal file
33
DomainClassifier/domainclassifier.py
Normal file
33
DomainClassifier/domainclassifier.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
import re
|
||||||
|
import dns.resolver
|
||||||
|
|
||||||
|
class Extract:
    """Extract domain-like strings from raw text and check them against DNS.

    Typical use is ``Extract(text).domain()`` to list the candidates found in
    ``text``, optionally followed by ``validdomain()`` to keep only the
    candidates for which a DNS query returns an answer.
    """

    # One or more dot-separated labels of letters, digits and hyphens, each
    # at most 63 characters long.  Letters and digits are not told apart, so
    # dotted numbers such as "1.2.3.4" are reported as candidates too.
    _DOMAIN_RE = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b')

    def __init__(self, rawtext=None):
        """Remember the text to scan.

        rawtext -- string to search for domains; None is treated as a text
                   containing no domain.
        """
        self.rawtext = rawtext
        # Results are cached under private names.  Storing them as
        # ``self.domain`` / ``self.validdomain`` would replace the methods of
        # the same name on this instance and make a second call fail.
        self._domains = None
        self._validdomains = None

    def domain(self):
        """Return the list of candidate domains found in ``rawtext``.

        Candidates are returned in order of appearance; duplicates are kept.
        The list is also cached for a later validdomain() call.
        """
        if self.rawtext is None:
            self._domains = []
        else:
            # findall() yields (whole match, last dotted label) tuples because
            # the pattern has two groups; only the whole match is of interest.
            self._domains = [
                found[0]
                for found in self._DOMAIN_RE.findall(self.rawtext)
                if found[0]
            ]
        return self._domains

    def validdomain(self, rtype=('A', 'AAAA', 'SOA', 'MX', 'CNAME'),
                    extended=None):
        """Return the candidate domains for which a DNS query succeeds.

        rtype    -- iterable of DNS record type names; every candidate is
                    queried once per type.
        extended -- when None (the default) the result is a set of domain
                    names.  Any other value switches the result to a list of
                    (domain, record type, first answer) tuples, one entry per
                    successful query.

        The candidates come from the last domain() call; if domain() has not
        been called yet it is run first.
        """
        if self._domains is None:
            self.domain()

        if extended is None:
            resolved = set()
        else:
            resolved = []

        for name in self._domains:
            for dnstype in rtype:
                try:
                    answers = dns.resolver.query(name, dnstype)
                except Exception:
                    # Best effort: a failed lookup (or a name the resolver
                    # rejects) only means this name/type pair is not
                    # reported.  KeyboardInterrupt and SystemExit are not
                    # caught, so a long run can still be interrupted.
                    continue
                if extended is None:
                    resolved.add(name)
                else:
                    resolved.append((name, dnstype, answers[0]))

        self._validdomains = resolved
        return resolved
|
||||||
|
|
5
DomainClassifier/test.py
Normal file
5
DomainClassifier/test.py
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
# Manual smoke test for domainclassifier.Extract.
# Note: validdomain() performs DNS queries for every extracted candidate.
import domainclassifier

c = domainclassifier.Extract("this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be")
# print(...) with a single argument prints the same thing under Python 2 and
# is valid syntax under Python 3, unlike the bare print statement.
print(c.domain())
print(c.validdomain(extended=None))
|
Loading…
Reference in a new issue