diff --git a/README.md b/README.md index 0381a9e..eb16c34 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ DomainClassifier ================ DomainClassifier is a simple Python library to extract and classify Internet -domains/hostnames from raw text files following their existence, localization -or attributes. +domains/hostnames/IP addresses from raw text files according to their existence, +localization or attributes. DomainClassifier can be used to extract Internet hosts from any free texts. @@ -15,7 +15,9 @@ How To Use It ```python import domainclassifier -c = domainclassifier.Extract( rawtext = "this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be http://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test") +c = domainclassifier.Extract( rawtext = "www.xxx.com this is a text with a domain called test@foo.lu another test abc.lu something a.b.c.d.e end of 1.2.3.4 foo.be www.belnet.be ht +tp://www.cert.be/ www.public.lu www.allo.lu quuxtest www.eurodns.com something-broken-www.google.com www.google.lu trailing test www.facebook.com www.nic.ru www.youporn.com 8.8.8. 
+8 201.1.1.1") # extracting potentially valid domains from rawtext print c.domain() @@ -34,21 +36,40 @@ print "BE:" print c.localizedomain(cc='BE') print "Ranking:" print c.rankdomain() + +# extract valid IPv4 addresses (using the potential list of valid domains) +print "List of ip addresses:" +print c.ipaddress(extended=True) + +# some more filtering +print "Include dot.lu:" +print c.include(expression=r'\.lu$') +print "Exclude dot.lu:" +print c.exclude(expression=r'\.lu$') ``` ### Sample output ```python -['foo.lu', 'abc.lu', 'a.b.c.d.e', '1.2.3.4', 'foo.be', 'www.belnet.be', 'www.cert.be', 'www.public.lu', 'www.allo.lu', 'www.eurodns.com', 'something-broken-www.google.com', 'www.google.lu'] -[('abc.lu', 'SOA', ), ('abc.lu', 'MX', ), ('foo.be', 'A', ), ('foo.be', 'AAAA', ), ('foo.be', 'SOA', ), ('foo.be', 'MX', ), ('www.belnet.be', 'A', ), ('www.belnet.be', 'AAAA', ), ('www.belnet.be', 'CNAME', ), ('www.cert.be', 'A', ), ('www.cert.be', 'AAAA', ), ('www.cert.be', 'SOA', ), ('www.cert.be', 'MX', ), ('www.cert.be', 'CNAME', ), ('www.public.lu', 'A', ), ('www.allo.lu', 'A', ), ('www.eurodns.com', 'A', ), ('www.google.lu', 'A', ), ('www.google.lu', 'CNAME', )] +['www.xxx.com', 'foo.lu', 'abc.lu', 'a.b.c.d.e', '1.2.3.4', 'foo.be', 'www.belnet.be', 'www.cert.be', 'www.public.lu', 'www.allo.lu', 'www.eurodns.com', 'something-broken-www.google.com', 'www.google.lu', 'www.facebook.com', 'www.nic.ru', 'www.youporn.com', '8.8.8.8', '201.1.1.1'] +[('www.xxx.com', 'A', ), ('abc.lu', 'SOA', ), ('abc.lu', 'MX', ), ('foo.be', 'A', ), ('foo.be', 'AAAA', ), ('foo.be', 'SOA', ), ('foo.be', 'MX', ), ('www.belnet.be', 'A', ), ('www.belnet.be', 'AAAA', ), ('www.belnet.be', 'CNAME', ), ('www.cert.be', 'A', ), ('www.cert.be', 'AAAA', ), ('www.cert.be', 'SOA', ), ('www.cert.be', 'MX', ), ('www.cert.be', 'CNAME', ), ('www.public.lu', 'A', ), ('www.allo.lu', 'A', ), ('www.eurodns.com', 'A', ), ('www.google.lu', 'A', ), ('www.google.lu', 'AAAA', ), ('www.facebook.com', 'A', ), 
('www.facebook.com', 'AAAA', ), ('www.facebook.com', 'MX', ), ('www.facebook.com', 'CNAME', ), ('www.nic.ru', 'A', ), ('www.nic.ru', 'MX', ), ('www.youporn.com', 'A', ), ('www.youporn.com', 'SOA', ), ('www.youporn.com', 'MX', ), ('www.youporn.com', 'CNAME', )] US: -[('www.google.lu', 'A', ), ('www.google.lu', 'CNAME', )] +[('www.xxx.com', 'A', ), ('www.google.lu', 'A', )] LU: [('www.public.lu', 'A', ), ('www.allo.lu', 'A', ), ('www.eurodns.com', 'A', )] BE: -[('foo.be', 'A', ), ('www.belnet.be', 'A', ), ('www.belnet.be', 'CNAME', ), ('www.cert.be', 'A', ), ('www.cert.be', 'CNAME', )] +[('foo.be', 'A', ), ('www.belnet.be', 'A', ), ('www.belnet.be', 'CNAME', ), ('www.cert.be', 'A', ), ('www.cert.be', 'CNAME', )] Ranking: -[(1.0, 'foo.be'), (1.0000100806451599, 'www.belnet.be'), (1.0000100806451599, 'www.belnet.be'), (1.0000100806451599, 'www.cert.be'), (1.0000100806451599, 'www.cert.be'), (1.00021114864865, 'www.allo.lu'), (1.0002244274068299, 'www.public.lu'), (1.0002297794117601, 'www.eurodns.com'), (1.00338843724104, 'www.google.lu'), (1.00338843724104, 'www.google.lu')] +[(1.0, 'www.youporn.com'), (1.0, 'www.youporn.com'), (1.0000120563271599, 'www.belnet.be'), (1.0000120563271599, 'www.belnet.be'), (1.0000120563271599, 'www.cert.be'), (1.0000120563271599, 'www.cert.be'), (1.0000372023809501, 'foo.be'), (1.0001395089285701, 'www.public.lu'), (1.00015419407895, 'www.allo.lu'), (1.0003662109375, 'www.eurodns.com'), (1.0004111842105301, 'www.xxx.com'), (1.0005944293478299, 'www.nic.ru'), (1.0024646577381, 'www.facebook.com'), (1.0024646577381, 'www.facebook.com'), (1.002635288165, 'www.google.lu')] +List of ip addresses: +('15169', 'AU', ) +('15169', 'US', ) +('27699', 'BR', ) +set([('201.1.1.1', '(\'27699\', \'BR\', )'), ('8.8.8.8', '(\'15169\', \'US\', )'), ('1.2.3.4', '(\'15169\', \'AU\', )')]) +Include dot.lu: +['abc.lu', 'abc.lu', 'www.public.lu', 'www.allo.lu', 'www.google.lu', 'www.google.lu'] +Exclude dot.lu: +['www.xxx.com', 'foo.be', 'foo.be', 'foo.be', 
'foo.be', 'www.belnet.be', 'www.belnet.be', 'www.belnet.be', 'www.cert.be', 'www.cert.be', 'www.cert.be', 'www.cert.be', 'www.cert.be', 'www.eurodns.com', 'www.facebook.com', 'www.facebook.com', 'www.facebook.com', 'www.facebook.com', 'www.nic.ru', 'www.nic.ru', 'www.youporn.com', 'www.youporn.com', 'www.youporn.com', 'www.youporn.com'] ``` ### Software Required