Add a text() method to classify multiple raw texts

This is used when you have a large set of raw texts to analyze and you want to avoid repeating the initialisation part for each of them.
2024-11-22 18:07:07 +00:00 · 2014-09-04 22:03:05 +02:00 · 2014-09-04 22:03:05 +02:00 · 6e741ad37c
commit 6e741ad37c
parent 0ce9d7d1a8
1 changed files with 34 additions and 17 deletions
--- a/DomainClassifier/domainclassifier.py
+++ b/DomainClassifier/domainclassifier.py
@ -13,7 +13,7 @@ import urllib2
 __author__ = "Alexandre Dulaunoy"
 __copyright__ = "Copyright 2012-2014, Alexandre Dulaunoy"
 __license__ = "AGPL version 3"
-__version__ = "0.4"
+__version__ = "0.5"
 class Extract:
@ -101,6 +101,13 @@ class Extract:
        return self.cleandomain
    def text(self, rawtext=False):
        """Load a new raw text into this already-initialised instance.

        When rawtext is truthy it is stored in self.rawtext, self.domain is
        recomputed by calling potentialdomain(), self.vdomain is reset to an
        empty list, and True is returned. When rawtext is falsy nothing is
        modified and False is returned.
        """
        # Guard clause: nothing to load, leave the current state untouched.
        if not rawtext:
            return False
        self.rawtext = rawtext
        # Re-extract the candidate domains from the freshly stored text.
        self.domain = self.potentialdomain()
        # Validation results from a previous text no longer apply.
        self.vdomain = []
        return True
    """potentialdomain method extracts potential domains matching any
    string that is a series of strings of at most 63 characters, separated by
    dots. The method uses the rawtext defined at the instantiation of the class.
@ -123,9 +130,9 @@ class Extract:
    def validdomain(self, rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'], extended=True):
        if extended is False:
-            self.validdomain = set()
+            self.vdomain = set()
        else:
-            self.validdomain = []
+            self.vdomain = []
        for domain in self.domain:
            for dnstype in rtype:
@ -136,10 +143,10 @@ class Extract:
                else:
                    self.vdomain.append(domain)
                    if extended is False:
-                        self.validdomain.add((domain))
+                        self.vdomain.add((domain))
                    else:
-                        self.validdomain.append((domain, dnstype, answers[0]))
+                        self.vdomain.append((domain, dnstype, answers[0]))
-        return self.validdomain
+        return self.vdomain
    """ipaddress method extracts from the domain list the valid IPv4 addresses"""
@ -171,7 +178,7 @@ class Extract:
    def localizedomain(self, cc=None):
        self.localdom = []
-        for dom in self.validdomain:
+        for dom in self.vdomain:
            if dom[1] == 'A':
                ip = dom[2]
                try:
@ -199,8 +206,8 @@ class Extract:
    def rankdomain(self):
        self.rankdom = []
-        if self.validdomain:
+        if self.vdomain:
-            for dom in self.validdomain:
+            for dom in self.vdomain:
                rank = None
                asn = None
                if dom[1] == 'A':
@ -229,6 +236,9 @@ class Extract:
    """exclude domains from a regular expression. If validdomain was called,
    it's only on the valid domain list."""
    """exclude domains from a regular expression. If validdomain was called,
    it's only on the valid domain list."""
    def exclude(self, expression=None):
        self.cleandomain = []
@ -240,15 +250,15 @@ class Extract:
            domains = self.vdomain
        for dom in domains:
            if type(dom) == tuple:
                dom = dom[0]
            if excludefilter.search(dom):
                pass
            else:
                self.cleandomain.append(dom)
        return self.cleandomain
    """include domains from a regular expression. If validdomain was called,
    it's only on the valid domain list."""
    def include(self, expression=None):
        self.cleandomain = []
@ -260,14 +270,17 @@ class Extract:
            domains = self.vdomain
        for dom in domains:
-            if includefilter.search(dom):
+            if type(dom) == tuple:
-                self.cleandomain.append(dom)
+                dom = dom[0]
-        return self.cleandomain
+            if includefilter.search(dom):
                    self.cleandomain.append(dom)
        return set(self.cleandomain)
 if __name__ == "__main__":
-    c = Extract(rawtext="www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist", nameservers=['127.0.0.1'])
+    c = Extract(rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist", nameservers=['127.0.0.1'])
-
+    c.text(rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be")
    print (c.potentialdomain())
    print (c.potentialdomain(validTLD=True))
    print (c.validdomain(extended=True))
@ -285,3 +298,7 @@ if __name__ == "__main__":
    print (c.include(expression=r'\.lu$'))
    print ("Exclude dot.lu:")
    print (c.exclude(expression=r'\.lu$'))
    c.text(rawtext="www.lwn.net www.undeadly.org")
    print (c.potentialdomain(validTLD=True))
    c.validdomain()
    print (c.localizedomain(cc='US'))