First version of the domain extractor and classifier

The domainclassifier class has two methods: - domain(), which extracts all potential domains from a raw text. The method returns a list. - validdomain(), which returns all the existing domains based on their known DNS record sets, such as A, AAAA or CNAME records. The method returns a set. If the extended option is requested, it returns a list of tuples, each containing a domain, the DNS record type that exists for it, and the returned data.
2024-11-23 02:17:07 +00:00 · 2012-01-23 16:16:01 +01:00 · 2012-01-23 16:16:01 +01:00 · 9eb1e3e4ef
commit 9eb1e3e4ef
3 changed files with 38 additions and 0 deletions
--- a/DomainClassifier/init.py
+++ b/DomainClassifier/init.py
--- a/DomainClassifier/domainclassifier.py
+++ b/DomainClassifier/domainclassifier.py
@ -0,0 +1,33 @@
 import re
 import dns.resolver
 class Extract:
     """Extract candidate domain names from raw text and check them in DNS.

     Typical use::

         c = Extract("some text mentioning foo.lu and abc.lu")
         c.domain()        # -> ['foo.lu', 'abc.lu']
         c.validdomain()   # -> set of the candidates that resolve

     The extracted candidates are kept in the private ``_domains`` attribute
     so that the ``domain`` and ``validdomain`` methods are never shadowed by
     their own results and can safely be called more than once.
     """

     # Compiled once for the whole class instead of on every domain() call.
     # The match is purely syntactic: dot-separated runs of letters, digits
     # and hyphens, so dotted numbers such as "1.2.3.4" are candidates too.
     _DOMAIN_PATTERN = re.compile(r'\b([a-zA-Z\d-]{,63}(\.[a-zA-Z\d-]{,63})+)\b')

     def __init__(self, rawtext=None):
         """Store the text to scan.

         rawtext -- the raw text to search for domain names; may be None,
                    in which case no domain is ever extracted.
         """
         self.rawtext = rawtext
         # None means "domain() has not run yet"; a list (possibly empty)
         # means extraction has been done.
         self._domains = None

     def domain(self):
         """Return the list of potential domains found in the raw text.

         Candidates are returned in order of appearance and may contain
         duplicates. An instance created without text yields an empty list.
         """
         if self.rawtext is None:
             self._domains = []
         else:
             # findall() yields (whole_match, last_repeated_group) tuples;
             # only the whole match is of interest.
             self._domains = [
                 groups[0]
                 for groups in self._DOMAIN_PATTERN.findall(self.rawtext)
                 if groups[0]
             ]
         return self._domains

     def validdomain(self, rtype=('A', 'AAAA', 'SOA', 'MX', 'CNAME'), extended=None):
         """Return the extracted domains that have at least one DNS record.

         rtype    -- iterable of DNS record type names to look up.
         extended -- when None (default) a set of domain names is returned;
                     otherwise a list of (domain, record type, first answer)
                     tuples, one per record type that resolved.

         Domains are extracted on demand if domain() has not been called yet.
         Lookups are best effort: any error raised while resolving a name
         (no such domain, no answer, timeout, malformed name, ...) simply
         means the name/type pair is not reported.
         """
         if self._domains is None:
             self.domain()

         found = set() if extended is None else []
         for name in self._domains:
             for dnstype in rtype:
                 try:
                     answers = dns.resolver.query(name, dnstype)
                 except Exception:
                     # Deliberately broad (resolution is best effort), but no
                     # longer a bare except, so KeyboardInterrupt and
                     # SystemExit still propagate during slow lookups.
                     continue
                 if extended is None:
                     found.add(name)
                     # One existing record is enough to call the domain
                     # valid; skip the remaining record types.
                     break
                 found.append((name, dnstype, answers[0]))
         return found
--- a/DomainClassifier/test.py
+++ b/DomainClassifier/test.py
@ -0,0 +1,5 @@
 import domainclassifier
 # Smoke test: extract the candidate domains from a sample sentence, then
 # show which of them have DNS records (this performs live DNS lookups).
 c = domainclassifier.Extract("this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be")
 # print(...) with a single argument behaves the same on Python 2 and 3.
 print(c.domain())
 print(c.validdomain(extended=None))