First version of the domain extractor and classifier

The Extract class of the domainclassifier module has two methods:

- domain() to extract all potential domains from a raw text
  The method returns a list.

- validdomain() to return all the domains that actually exist, based on
  their known DNS record sets such as A, AAAA or CNAME records.
  The method returns a set. If the extended option is requested, it returns
  a list of tuples, each containing the domain, the DNS record type that
  exists and the returned data.
This commit is contained in:
Alexandre Dulaunoy 2012-01-23 16:16:01 +01:00
commit 9eb1e3e4ef
3 changed files with 38 additions and 0 deletions

View file

View file

@ -0,0 +1,33 @@
import re
import dns.resolver
class Extract:
    """Extract candidate domain names from raw text and check them in DNS.

    Typical use is to build the object with a text, call ``domain()`` to
    collect every string that looks like a domain name, then call
    ``validdomain()`` to keep only the ones for which a DNS query succeeds.

    Results are cached in private attributes (``_domains`` and
    ``_validdomains``) so that the ``domain`` and ``validdomain`` methods
    are never overwritten and can safely be called more than once.
    """

    # Dot-separated labels of at most 63 letters, digits or hyphens each.
    # Compiled once for the class instead of on every domain() call.
    _DOMAIN_RE = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b')

    def __init__(self, rawtext=None):
        """Store the text to analyse.

        rawtext -- the raw text to scan for domain names; ``None`` is
                   treated as an empty text by ``domain()``.
        """
        self.rawtext = rawtext
        self._domains = None
        self._validdomains = None

    def domain(self):
        """Return the list of potential domain names found in the raw text.

        Matches are returned in order of appearance and duplicates are
        kept. The pattern is purely syntactic, so dotted strings such as
        IPv4 addresses are returned as well.
        """
        text = '' if self.rawtext is None else self.rawtext
        self._domains = [m.group(1) for m in self._DOMAIN_RE.finditer(text)]
        return self._domains

    def validdomain(self, rtype=('A', 'AAAA', 'SOA', 'MX', 'CNAME'),
                    extended=None):
        """Return the extracted domains for which a DNS query succeeds.

        rtype    -- iterable of DNS record types to query for each domain.
        extended -- when ``None`` (the default) a set of domain names is
                    returned; for any other value a list of
                    ``(domain, record type, first answer)`` tuples is
                    returned, one per record type that was answered.

        If ``domain()`` has not been called yet, the extraction is run
        first. Any error raised by a query is treated as "this record
        does not exist" and the query is skipped.
        """
        if self._domains is None:
            self.domain()

        want_details = extended is not None
        found = [] if want_details else set()

        for name in self._domains:
            for dnstype in rtype:
                try:
                    answers = dns.resolver.query(name, dnstype)
                except Exception:
                    # Best effort: a failed lookup only means the record is
                    # not usable. Unlike a bare ``except``, this still lets
                    # KeyboardInterrupt and SystemExit propagate.
                    continue
                if want_details:
                    found.append((name, dnstype, answers[0]))
                else:
                    found.add(name)
                    # One successful answer is enough to validate the
                    # domain; further queries could not change the set.
                    break

        self._validdomains = found
        return found

5
DomainClassifier/test.py Normal file
View file

@ -0,0 +1,5 @@
"""Manual smoke test for domainclassifier.Extract."""
import domainclassifier

# Sample text mixing an e-mail address, plain domain names, a long dotted
# name and an IPv4 address, to show what the extractor picks up.
c = domainclassifier.Extract("this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be")

# The parenthesised single-argument form of print behaves the same on
# Python 2 and Python 3; the bare statement form is a syntax error on 3.
print(c.domain())
# extended=None selects the plain set-of-domains result.
print(c.validdomain(extended=None))