From 6e741ad37c20ad168b82e71e59ecfa67772a126d Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy <a@foo.be>
Date: Thu, 4 Sep 2014 22:03:05 +0200
Subject: [PATCH] text method added to classify multiple raw texts

This is used when you have a large set of raw texts to analyze
and you want to avoid repeating the initialisation part.
---
 DomainClassifier/domainclassifier.py | 51 ++++++++++++++++++----------
 1 file changed, 34 insertions(+), 17 deletions(-)

diff --git a/DomainClassifier/domainclassifier.py b/DomainClassifier/domainclassifier.py
index c033c91..7e56728 100644
--- a/DomainClassifier/domainclassifier.py
+++ b/DomainClassifier/domainclassifier.py
@@ -13,7 +13,7 @@ import urllib2
 __author__ = "Alexandre Dulaunoy"
 __copyright__ = "Copyright 2012-2014, Alexandre Dulaunoy"
 __license__ = "AGPL version 3"
-__version__ = "0.4"
+__version__ = "0.5"
 
 
 class Extract:
@@ -101,6 +101,13 @@ class Extract:
 
         return self.cleandomain
 
+    def text(self, rawtext=False):
+        if rawtext:
+            self.rawtext = rawtext
+            self.domain = self.potentialdomain()
+            self.vdomain = []
+            return True
+        return False
     """potentialdomain method extracts potential domains matching any
     string that is a serie of string with maximun 63 character separated by a
     dot. The method used the rawtext defined at the instantiation of the class.
@@ -123,9 +130,9 @@ class Extract:
 
     def validdomain(self, rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'], extended=True):
         if extended is False:
-            self.validdomain = set()
+            self.vdomain = set()
         else:
-            self.validdomain = []
+            self.vdomain = []
 
         for domain in self.domain:
             for dnstype in rtype:
@@ -136,10 +143,10 @@ class Extract:
                 else:
                     self.vdomain.append(domain)
                     if extended is False:
-                        self.validdomain.add((domain))
+                        self.vdomain.add((domain))
                     else:
-                        self.validdomain.append((domain, dnstype, answers[0]))
-        return self.validdomain
+                        self.vdomain.append((domain, dnstype, answers[0]))
+        return self.vdomain
 
     """ipaddress method extracts from the domain list the valid IPv4 addresses"""
 
@@ -171,7 +178,7 @@ class Extract:
     def localizedomain(self, cc=None):
         self.localdom = []
 
-        for dom in self.validdomain:
+        for dom in self.vdomain:
             if dom[1] == 'A':
                 ip = dom[2]
                 try:
@@ -199,8 +206,8 @@ class Extract:
     def rankdomain(self):
         self.rankdom = []
 
-        if self.validdomain:
-            for dom in self.validdomain:
+        if self.vdomain:
+            for dom in self.vdomain:
                 rank = None
                 asn = None
                 if dom[1] == 'A':
@@ -229,6 +236,9 @@ class Extract:
     """exclude domains from a regular expression. If validdomain was called,
     it's only on the valid domain list."""
 
+    """exclude domains from a regular expression. If validdomain was called,
+    it's only on the valid domain list."""
+
     def exclude(self, expression=None):
         self.cleandomain = []
 
@@ -240,15 +250,15 @@ class Extract:
             domains = self.vdomain
 
         for dom in domains:
+            if type(dom) == tuple:
+                dom = dom[0]
+
             if excludefilter.search(dom):
                 pass
             else:
                 self.cleandomain.append(dom)
         return self.cleandomain
 
-    """include domains from a regular expression. If validdomain was called,
-    it's only on the valid domain list."""
-
     def include(self, expression=None):
         self.cleandomain = []
 
@@ -260,14 +270,17 @@ class Extract:
             domains = self.vdomain
 
         for dom in domains:
-            if includefilter.search(dom):
-                self.cleandomain.append(dom)
+            if type(dom) == tuple:
+                dom = dom[0]
 
-        return self.cleandomain
+            if includefilter.search(dom):
+                    self.cleandomain.append(dom)
+
+        return set(self.cleandomain)
 
 if __name__ == "__main__":
-    c = Extract(rawtext="www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist", nameservers=['127.0.0.1'])
-
+    c = Extract(rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist", nameservers=['127.0.0.1'])
+    c.text(rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be")
     print (c.potentialdomain())
     print (c.potentialdomain(validTLD=True))
     print (c.validdomain(extended=True))
@@ -285,3 +298,7 @@ if __name__ == "__main__":
     print (c.include(expression=r'\.lu$'))
     print ("Exclude dot.lu:")
     print (c.exclude(expression=r'\.lu$'))
+    c.text(rawtext="www.lwn.net www.undeadly.org")
+    print (c.potentialdomain(validTLD=True))
+    c.validdomain()
+    print (c.localizedomain(cc='US'))