text method added to classify multiple raw texts

This is used when you have a large set of raw texts that you have
to analyze and you want to avoid the initialisation part.
This commit is contained in:
Alexandre Dulaunoy 2014-09-04 22:03:05 +02:00
parent 0ce9d7d1a8
commit 6e741ad37c

View file

@ -13,7 +13,7 @@ import urllib2
__author__ = "Alexandre Dulaunoy" __author__ = "Alexandre Dulaunoy"
__copyright__ = "Copyright 2012-2014, Alexandre Dulaunoy" __copyright__ = "Copyright 2012-2014, Alexandre Dulaunoy"
__license__ = "AGPL version 3" __license__ = "AGPL version 3"
__version__ = "0.4" __version__ = "0.5"
class Extract: class Extract:
@ -101,6 +101,13 @@ class Extract:
return self.cleandomain return self.cleandomain
def text(self, rawtext=False):
if rawtext:
self.rawtext = rawtext
self.domain = self.potentialdomain()
self.vdomain = []
return True
return False
"""potentialdomain method extracts potential domains matching any """potentialdomain method extracts potential domains matching any
string that is a serie of string with maximun 63 character separated by a string that is a serie of string with maximun 63 character separated by a
dot. The method used the rawtext defined at the instantiation of the class. dot. The method used the rawtext defined at the instantiation of the class.
@ -123,9 +130,9 @@ class Extract:
def validdomain(self, rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'], extended=True): def validdomain(self, rtype=['A', 'AAAA', 'SOA', 'MX', 'CNAME'], extended=True):
if extended is False: if extended is False:
self.validdomain = set() self.vdomain = set()
else: else:
self.validdomain = [] self.vdomain = []
for domain in self.domain: for domain in self.domain:
for dnstype in rtype: for dnstype in rtype:
@ -136,10 +143,10 @@ class Extract:
else: else:
self.vdomain.append(domain) self.vdomain.append(domain)
if extended is False: if extended is False:
self.validdomain.add((domain)) self.vdomain.add((domain))
else: else:
self.validdomain.append((domain, dnstype, answers[0])) self.vdomain.append((domain, dnstype, answers[0]))
return self.validdomain return self.vdomain
"""ipaddress method extracts from the domain list the valid IPv4 addresses""" """ipaddress method extracts from the domain list the valid IPv4 addresses"""
@ -171,7 +178,7 @@ class Extract:
def localizedomain(self, cc=None): def localizedomain(self, cc=None):
self.localdom = [] self.localdom = []
for dom in self.validdomain: for dom in self.vdomain:
if dom[1] == 'A': if dom[1] == 'A':
ip = dom[2] ip = dom[2]
try: try:
@ -199,8 +206,8 @@ class Extract:
def rankdomain(self): def rankdomain(self):
self.rankdom = [] self.rankdom = []
if self.validdomain: if self.vdomain:
for dom in self.validdomain: for dom in self.vdomain:
rank = None rank = None
asn = None asn = None
if dom[1] == 'A': if dom[1] == 'A':
@ -229,6 +236,9 @@ class Extract:
"""exclude domains from a regular expression. If validdomain was called, """exclude domains from a regular expression. If validdomain was called,
it's only on the valid domain list.""" it's only on the valid domain list."""
"""exclude domains from a regular expression. If validdomain was called,
it's only on the valid domain list."""
def exclude(self, expression=None): def exclude(self, expression=None):
self.cleandomain = [] self.cleandomain = []
@ -240,15 +250,15 @@ class Extract:
domains = self.vdomain domains = self.vdomain
for dom in domains: for dom in domains:
if type(dom) == tuple:
dom = dom[0]
if excludefilter.search(dom): if excludefilter.search(dom):
pass pass
else: else:
self.cleandomain.append(dom) self.cleandomain.append(dom)
return self.cleandomain return self.cleandomain
"""include domains from a regular expression. If validdomain was called,
it's only on the valid domain list."""
def include(self, expression=None): def include(self, expression=None):
self.cleandomain = [] self.cleandomain = []
@ -260,14 +270,17 @@ class Extract:
domains = self.vdomain domains = self.vdomain
for dom in domains: for dom in domains:
if includefilter.search(dom): if type(dom) == tuple:
self.cleandomain.append(dom) dom = dom[0]
return self.cleandomain if includefilter.search(dom):
self.cleandomain.append(dom)
return set(self.cleandomain)
if __name__ == "__main__": if __name__ == "__main__":
c = Extract(rawtext="www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist", nameservers=['127.0.0.1']) c = Extract(rawtext="www.foo.lu www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8.8 201.1.1.1 abc.dontexist", nameservers=['127.0.0.1'])
c.text(rawtext="www.abc.lu www.xxx.com random text a test bric broc www.lemonde.fr www.belnet.be www.foo.be")
print (c.potentialdomain()) print (c.potentialdomain())
print (c.potentialdomain(validTLD=True)) print (c.potentialdomain(validTLD=True))
print (c.validdomain(extended=True)) print (c.validdomain(extended=True))
@ -285,3 +298,7 @@ if __name__ == "__main__":
print (c.include(expression=r'\.lu$')) print (c.include(expression=r'\.lu$'))
print ("Exclude dot.lu:") print ("Exclude dot.lu:")
print (c.exclude(expression=r'\.lu$')) print (c.exclude(expression=r'\.lu$'))
c.text(rawtext="www.lwn.net www.undeadly.org")
print (c.potentialdomain(validTLD=True))
c.validdomain()
print (c.localizedomain(cc='US'))