commit 9eb1e3e4ef31d62d9bf431dab58ada9e27c87dcf
Author: Alexandre Dulaunoy
Date:   Mon Jan 23 16:16:01 2012 +0100

    First version of the domain extractor and classifier

    The class Extract of the domainclassifier module has two methods:

    - domain() to extract all potential domains from a raw text.
      The method returns a list.
    - validdomain() returning all the existing domains based on their
      known DNS record sets like A, AAAA or CNAME records.
      The method returns a set. If the extended option is requested,
      it's a list of tuples containing the domain with their existing
      DNS records and their returned data.

diff --git a/DomainClassifier/__init__.py b/DomainClassifier/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/DomainClassifier/domainclassifier.py b/DomainClassifier/domainclassifier.py
new file mode 100644
index 0000000..7ff349e
--- /dev/null
+++ b/DomainClassifier/domainclassifier.py
@@ -0,0 +1,59 @@
+import re
+
+import dns.exception
+import dns.resolver
+
+
+class Extract:
+    """Extract potential domain names from raw text and check them in DNS."""
+
+    # Dot-separated labels of 1 to 63 characters. Empty labels ("a..b") are
+    # rejected since they can never be part of a valid domain name.
+    _DOMAIN_RE = re.compile(
+        r'\b([a-zA-Z\d-]{1,63}(\.[a-zA-Z\d-]{1,63})+)\b')
+    # A tuple, so the default argument of validdomain() is immutable.
+    _DEFAULT_RTYPES = ('A', 'AAAA', 'SOA', 'MX', 'CNAME')
+
+    def __init__(self, rawtext=None):
+        self.rawtext = rawtext
+        # Kept under a private name: storing the result as self.domain
+        # would replace the domain() method after its first call.
+        self._domains = None
+
+    def domain(self):
+        """Return the list of potential domains found in the raw text.
+
+        The list is empty when no raw text was given.
+        """
+        if self.rawtext is None:
+            self._domains = []
+        else:
+            self._domains = [match[0] for match
+                             in self._DOMAIN_RE.findall(self.rawtext)]
+        return self._domains
+
+    def validdomain(self, rtype=_DEFAULT_RTYPES, extended=None):
+        """Return the extracted domains that have existing DNS records.
+
+        rtype    -- DNS record types queried for each domain.
+        extended -- if None, return a set of domain names; otherwise return
+                    a list of (domain, record type, first answer) tuples.
+        """
+        if self._domains is None:
+            # domain() has not been called yet: extract the domains first.
+            self.domain()
+        results = set() if extended is None else []
+        for domain in self._domains:
+            for dnstype in rtype:
+                try:
+                    answers = dns.resolver.query(domain, dnstype)
+                except dns.exception.DNSException:
+                    # NXDOMAIN, NoAnswer, Timeout, ...: nothing usable for
+                    # this record type. Unlike a bare except, this lets
+                    # KeyboardInterrupt and SystemExit propagate.
+                    continue
+                if extended is None:
+                    results.add(domain)
+                else:
+                    results.append((domain, dnstype, answers[0]))
+        return results
diff --git a/DomainClassifier/test.py b/DomainClassifier/test.py
new file mode 100644
index 0000000..8147c94
--- /dev/null
+++ b/DomainClassifier/test.py
@@ -0,0 +1,5 @@
+import domainclassifier
+
+c = domainclassifier.Extract("this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 \nfoo.be")
+print(c.domain())
+print(c.validdomain(extended=None))