From 526f88071c99e698ab2840ae75bd5883d304616e Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Thu, 20 Aug 2020 13:28:49 +0200 Subject: [PATCH] new: [feature] -s option to display the overall statistics of different tokens seen --- README.md | 3 ++- bin/napkin.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5199042..0e06076 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Analysis features are : # how to use napkin ~~~~ -usage: napkin.py [-h] [-v V] [-f F] [-t T] [-o O] +usage: napkin.py [-h] [-v V] [-f F] [-t T] [-s] [-o O] Extract statistical analysis of text @@ -31,6 +31,7 @@ optional arguments: -v V verbose output -f F file to analyse -t T maximum value for the top list (default is 100) -1 is no limit + -s display the overall statistics (default is False) -o O output format (default is csv) ~~~~ diff --git a/bin/napkin.py b/bin/napkin.py index 8695c9b..83fa89b 100644 --- a/bin/napkin.py +++ b/bin/napkin.py @@ -11,6 +11,7 @@ parser = argparse.ArgumentParser(description="Extract statistical analysis of te parser.add_argument('-v', help="verbose output") parser.add_argument('-f', help="file to analyse") parser.add_argument('-t', help="maximum value for the top list (default is 100) -1 is no limit", default=100) +parser.add_argument('-s', help="display the overall statistics (default is False)", default=False, action='store_true') parser.add_argument('-o', help="output format (default is csv)", default="csv") args = parser.parse_args() if args.f is None: @@ -37,34 +38,45 @@ doc = nlp(text) analysis = ["verb:napkin", "noun:napkin", "hashtag:napkin", "mention:napkin", "digit:napkin", "url:napking", "oov:napkin", "labels:napkin"] +redisdb.hset("stats", "token", doc.__len__()) + for token in doc: if token.pos_ == "VERB" and not token.is_oov: redisdb.zincrby("verb:napkin", 1, token.lemma_) + redisdb.hincrby("stats", "verb:napkin", 1) continue if token.pos_ == "NOUN" and not token.is_oov: redisdb.zincrby("noun:napkin", 1, token.lemma_) + redisdb.hincrby("stats", "noun:napkin", 1) continue if token.is_oov: value = "{}".format(token) if value.startswith('#'): redisdb.zincrby("hashtag:napkin", 1, value[1:]) + redisdb.hincrby("stats", "hashtag:napkin", 1) continue if value.startswith('@'): redisdb.zincrby("mention:napkin", 1, value[1:]) + redisdb.hincrby("stats", "mention:napkin", 1) continue if token.is_digit: redisdb.zincrby("digit:napkin", 1, value) + redisdb.hincrby("stats", "digit:napkin", 1) continue if token.is_space: + redisdb.hincrby("stats", "space:napkin", 1) continue if token.like_url: redisdb.zincrby("url:napkin", 1, value) + redisdb.hincrby("stats", "url:napkin", 1) continue if token.like_email: redisdb.zincrby("email:napkin", 1, value) + redisdb.hincrby("stats", "email:napkin", 1) continue redisdb.zincrby("oov:napkin", 1, value) + redisdb.hincrby("stats", "oov:napkin", 1) for entity in doc.ents: @@ -78,3 +90,5 @@ for anal in analysis: print ("{},{}".format(a[0],a[1])) print ("#") +if args.s: + print (redisdb.hgetall('stats'))