napkin-text-analysis/bin/napkin.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import redis
import spacy
from spacy_langdetect import LanguageDetector
import argparse
import sys
import simplejson as json

# Command-line interface of the script. Parsed options end up in ``args``
# and are read by the rest of the file.
parser = argparse.ArgumentParser(description="Extract statistical analysis of text")
# NOTE(review): -v has no action, so it expects a value, and it is not read
# anywhere in the visible script -- confirm before turning it into a flag.
parser.add_argument('-v', help="verbose output")
parser.add_argument('-f', help="file to analyse")
# type=int makes argparse reject a non-numeric limit up front and guarantees
# that args.t is an integer whether it comes from the default or the user.
parser.add_argument('-t', help="maximum value for the top list (default is 100) -1 is no limit", default=100, type=int)
parser.add_argument('-s', help="display the overall statistics (default is False)", default=False,  action='store_true')
parser.add_argument('-o', help="output format (default is csv), json", default="csv")
parser.add_argument('-l', help="language used for the analysis (default is en)", default="en")
parser.add_argument('--verbatim', help="Don't use the lemmatized form, use verbatim. (default is the lematized form)", default=False, action='store_true')
parser.add_argument('--no-flushdb', help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. (by default the redis database is flushed at each run)", default=False, action='store_true')

args = parser.parse_args()
if args.f is None:
    # No input file means there is nothing to analyse: show the usage text
    # and signal the failure to the caller through a non-zero exit status.
    parser.print_help()
    sys.exit(1)

# Redis holds every counter produced by this run: sorted sets for the
# per-category frequencies and a "stats" hash for the overall totals.
redisdb = redis.Redis(host="localhost", port=6380, db=5)

try:
    # The client connects lazily, so ping() is the first real round trip.
    redisdb.ping()
except redis.exceptions.RedisError:
    # Only Redis failures are handled here; a bare except would also hide
    # KeyboardInterrupt and genuine programming errors.
    print("Redis database on port 6380 is not running...", file=sys.stderr)
    sys.exit(1)

# Start from an empty database unless the user asked to aggregate the
# results of several runs.
if not args.no_flushdb:
    redisdb.flushdb()

# Choose the spaCy model: French when "-l fr" is given, the English model
# for every other value of the option.
model_name = "fr_core_news_md" if args.l == "fr" else "en_core_web_md"
nlp = spacy.load(model_name)

# Language detection runs as the final pipeline component.
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

# Raise the limit on the number of characters spaCy accepts in one text.
nlp.max_length = 2000000

# NOTE(review): the file is opened with the platform default encoding --
# confirm whether inputs are always UTF-8.
with open(args.f, 'r') as handle:
    text = handle.read()

# Run the whole pipeline once over the complete file content.
doc = nlp(text)

# Sorted-set keys that are reported at the end of the run, in display order.
# Every name has to match the key the tallying loop writes to: the former
# "url:napking" spelling meant the URL counts were never shown, and the
# e-mail set was collected but missing from the report.
analysis = ["verb:napkin", "noun:napkin", "hashtag:napkin", "mention:napkin",
            "digit:napkin", "url:napkin", "email:napkin", "oov:napkin",
            "labels:napkin", "punct:napkin"]

# Add this document's token count to the running total. On a freshly
# flushed database this equals a plain set; with --no-flushdb it accumulates
# across runs like every other field of the "stats" hash.
redisdb.hincrby("stats", "token", len(doc))

# Part-of-speech tag -> Redis key used for in-vocabulary words.
_POS_KEYS = {"VERB": "verb:napkin", "NOUN": "noun:napkin"}


def _tally(key, member=None):
    """Record one occurrence for the category ``key``.

    When ``member`` is given, its score in the sorted set ``key`` is raised
    by one. The ``key`` field of the ``stats`` hash is raised by one in
    either case.
    """
    if member is not None:
        redisdb.zincrby(key, 1, member)
    redisdb.hincrby("stats", key, 1)


for token in doc:
    if not token.is_oov:
        # In-vocabulary tokens are counted only when tagged VERB or NOUN;
        # every other in-vocabulary token is skipped.
        bucket = _POS_KEYS.get(token.pos_)
        if bucket is not None:
            # --verbatim keeps the surface form, otherwise the lemma is used.
            _tally(bucket, token.text if args.verbatim else token.lemma_)
        continue

    # Out-of-vocabulary tokens: the first matching category wins, in the
    # order of the tests below.
    surface = str(token)
    if surface.startswith('#'):
        _tally("hashtag:napkin", surface[1:])
    elif surface.startswith('@'):
        _tally("mention:napkin", surface[1:])
    elif token.is_digit:
        _tally("digit:napkin", surface)
    elif token.is_space:
        # Whitespace only feeds the overall statistics, not a ranking.
        _tally("space:napkin")
    elif token.like_url:
        _tally("url:napkin", surface)
    elif token.like_email:
        _tally("email:napkin", surface)
    elif token.is_punct:
        _tally("punct:napkin", surface)
    else:
        _tally("oov:napkin", surface)


# Named-entity labels are ranked too, but have no field in the stats hash.
for ent in doc.ents:
    redisdb.zincrby("labels:napkin", 1, ent.label_)

def _as_text(raw):
    """Return ``raw`` decoded as UTF-8 when it is ``bytes``, else unchanged."""
    return raw.decode("utf-8") if isinstance(raw, bytes) else raw


# ZREVRANGE takes 0-based, inclusive ranks: the best entry is rank 0 and a
# top-N list ends at rank N-1. A stop of -1 means "up to the last member",
# which implements the documented "-1 is no limit".
limit = int(args.t)
last_rank = limit - 1 if limit > 0 else -1

if args.o == "json":
    output_json = {"format": "napkin"}

for key in analysis:
    if limit == 0:
        # A top-0 list is empty; asking Redis with stop=-1 would return all.
        rows = []
    else:
        rows = redisdb.zrevrange(key, 0, last_rank, withscores=True)
    # Members come back as bytes unless the client decodes responses;
    # normalise them so the CSV shows the word itself instead of b'word'.
    ranked = [[_as_text(member), score] for member, score in rows]

    if args.o == "csv":
        print("# Top {} of {}".format(args.t, key))
        for member, score in ranked:
            print("{},{}".format(member, score))
        print("#")
    elif args.o == "json":
        output_json[key] = ranked

if args.s:
    # NOTE(review): in JSON mode this prints a Python dict before the JSON
    # document, so the combined output is not a single JSON value.
    print(redisdb.hgetall('stats'))
if args.o == "json":
    print(json.dumps(output_json))
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`#!/usr/bin/env python3`
			`# -- coding: utf-8 --`

			`import redis`
			`import spacy`
			`from spacy_langdetect import LanguageDetector`
			`import argparse`
			`import sys`
new: [output] JSON export added 2020-09-21 05:50:57 +00:00			`import simplejson as json`
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00
			`parser = argparse.ArgumentParser(description="Extract statistical analysis of text")`
			`parser.add_argument('-v', help="verbose output")`
			`parser.add_argument('-f', help="file to analyse")`
			`parser.add_argument('-t', help="maximum value for the top list (default is 100) -1 is no limit", default=100)`
new: [feature] -s option to display the overall statistics of different tokens seen 2020-08-20 11:28:49 +00:00			`parser.add_argument('-s', help="display the overall statistics (default is False)", default=False, action='store_true')`
new: [output] JSON export added 2020-09-21 05:50:57 +00:00			`parser.add_argument('-o', help="output format (default is csv), json", default="csv")`
new: [lemmatized/verbatim] displaying verbatim or lemmatized version is now an option 2020-10-08 21:13:51 +00:00			`parser.add_argument('-l', help="language used for the analysis (default is en)", default="en")`
			`parser.add_argument('--verbatim', help="Don't use the lemmatized form, use verbatim. (default is the lematized form)", default=False, action='store_true')`
new: [option] Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. 2020-10-08 21:22:00 +00:00			`parser.add_argument('--no-flushdb', help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. (by default the redis database is flushed at each run)", default=False, action='store_true')`
chg: [args] add an option to force the language 2020-10-01 21:06:39 +00:00
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`args = parser.parse_args()`
			`if args.f is None:`
			`parser.print_help()`
			`sys.exit()`

			`redisdb = redis.Redis(host="localhost", port=6380, db=5)`

			`try:`
new: [option] Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. 2020-10-08 21:22:00 +00:00			`redisdb.ping()`
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`except:`
			`print("Redis database on port 6380 is not running...", file=sys.stderr)`
			`sys.exit()`

new: [option] Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. 2020-10-08 21:22:00 +00:00			`if not args.no_flushdb:`
			`redisdb.flushdb()`

chg: [args] add an option to force the language 2020-10-01 21:06:39 +00:00			`if args.l == "fr":`
			`nlp = spacy.load("fr_core_news_md")`
			`else:`
			`nlp = spacy.load("en_core_web_md")`

new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)`

			`nlp.max_length = 2000000`

			`with open(args.f, 'r') as file:`
			`text = file.read()`

			`doc = nlp(text)`

chg: [feature] add punct statistics for the oov (but the punct in spacy.io seems super buggy or incorrect) 2020-08-20 12:33:15 +00:00			`analysis = ["verb:napkin", "noun:napkin", "hashtag:napkin", "mention:napkin",`
			`"digit:napkin", "url:napking", "oov:napkin", "labels:napkin",`
			`"punct:napkin"]`
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00
new: [feature] -s option to display the overall statistics of different tokens seen 2020-08-20 11:28:49 +00:00			`redisdb.hset("stats", "token", doc.__len__())`

new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`for token in doc:`
			`if token.pos_ == "VERB" and not token.is_oov:`
new: [lemmatized/verbatim] displaying verbatim or lemmatized version is now an option 2020-10-08 21:13:51 +00:00			`if not args.verbatim:`
			`redisdb.zincrby("verb:napkin", 1, token.lemma_)`
			`else:`
			`redisdb.zincrby("verb:napkin", 1, token.text)`
new: [feature] -s option to display the overall statistics of different tokens seen 2020-08-20 11:28:49 +00:00			`redisdb.hincrby("stats", "verb:napkin", 1)`
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`continue`
			`if token.pos_ == "NOUN" and not token.is_oov:`
new: [lemmatized/verbatim] displaying verbatim or lemmatized version is now an option 2020-10-08 21:13:51 +00:00			`if not args.verbatim:`
			`redisdb.zincrby("noun:napkin", 1, token.lemma_)`
			`else:`
			`redisdb.zincrby("noun:napkin", 1, token.text)`
new: [feature] -s option to display the overall statistics of different tokens seen 2020-08-20 11:28:49 +00:00			`redisdb.hincrby("stats", "noun:napkin", 1)`
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`continue`

			`if token.is_oov:`
			`value = "{}".format(token)`
			`if value.startswith('#'):`
			`redisdb.zincrby("hashtag:napkin", 1, value[1:])`
new: [feature] -s option to display the overall statistics of different tokens seen 2020-08-20 11:28:49 +00:00			`redisdb.hincrby("stats", "hashtag:napkin", 1)`
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`continue`
			`if value.startswith('@'):`
			`redisdb.zincrby("mention:napkin", 1, value[1:])`
new: [feature] -s option to display the overall statistics of different tokens seen 2020-08-20 11:28:49 +00:00			`redisdb.hincrby("stats", "mention:napkin", 1)`
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`continue`
			`if token.is_digit:`
			`redisdb.zincrby("digit:napkin", 1, value)`
new: [feature] -s option to display the overall statistics of different tokens seen 2020-08-20 11:28:49 +00:00			`redisdb.hincrby("stats", "digit:napkin", 1)`
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`continue`
			`if token.is_space:`
new: [feature] -s option to display the overall statistics of different tokens seen 2020-08-20 11:28:49 +00:00			`redisdb.hincrby("stats", "space:napkin", 1)`
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`continue`
			`if token.like_url:`
			`redisdb.zincrby("url:napkin", 1, value)`
new: [feature] -s option to display the overall statistics of different tokens seen 2020-08-20 11:28:49 +00:00			`redisdb.hincrby("stats", "url:napkin", 1)`
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`continue`
			`if token.like_email:`
			`redisdb.zincrby("email:napkin", 1, value)`
new: [feature] -s option to display the overall statistics of different tokens seen 2020-08-20 11:28:49 +00:00			`redisdb.hincrby("stats", "email:napkin", 1)`
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`continue`
chg: [feature] add punct statistics for the oov (but the punct in spacy.io seems super buggy or incorrect) 2020-08-20 12:33:15 +00:00			`if token.is_punct:`
			`redisdb.zincrby("punct:napkin", 1, value)`
			`redisdb.hincrby("stats", "punct:napkin", 1)`
			`continue`

new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`redisdb.zincrby("oov:napkin", 1, value)`
new: [feature] -s option to display the overall statistics of different tokens seen 2020-08-20 11:28:49 +00:00			`redisdb.hincrby("stats", "oov:napkin", 1)`
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00

			`for entity in doc.ents:`
			`redisdb.zincrby("labels:napkin", 1, entity.label_)`

new: [output] JSON export added 2020-09-21 05:50:57 +00:00			`if args.o == "json":`
			`output_json = {"format":"napkin"}`
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`for anal in analysis:`
			`x = redisdb.zrevrange(anal, 1, args.t, withscores=True)`
new: [output] JSON export added 2020-09-21 05:50:57 +00:00			`if args.o == "csv":`
			`print ("# Top {} of {}".format(args.t, anal))`
			`elif args.o == "json":`
			`output_json.update({anal:[]})`
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00			`for a in x:`
			`if args.o == "csv":`
			`print ("{},{}".format(a[0],a[1]))`
new: [output] JSON export added 2020-09-21 05:50:57 +00:00			`elif args.o == "json":`
			`output_json[anal].append(a)`
			`if args.o == "csv":`
			`print ("#")`
new: [napkin] first release Napkin is a Python tool to produce statistical analysis of a text. Analysis features are : - Verbs frequency - Nouns frequency - Digit frequency - Labels frequency such as (Person, organisation, product, location) as defined in spacy.io [named entities](https://spacy.io/api/annotation#named-entities) - URL frequency - Email frequency - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary 2020-08-19 15:33:04 +00:00
new: [feature] -s option to display the overall statistics of different tokens seen 2020-08-20 11:28:49 +00:00			`if args.s:`
			`print (redisdb.hgetall('stats'))`
new: [output] JSON export added 2020-09-21 05:50:57 +00:00			`if args.o == "json":`
			`print(json.dumps(output_json))`