From 949e41d19f325b4e03101760fca6562e7b4cd002 Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Thu, 8 Oct 2020 23:13:51 +0200 Subject: [PATCH] new: [lemmatized/verbatim] displaying verbatim or lemmatized version is now an option --- README.md | 9 +++++++-- bin/napkin.py | 13 ++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0e06076..1932b2a 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,8 @@ Analysis features are : - Mention frequency (everything prefixed with an @ symbol) - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary +Verbs and nouns are in their lemmatized form by default, but the option `--verbatim` keeps the original inflection. + # requirements - Python >= 3.6 @@ -22,7 +24,7 @@ Analysis features are : # how to use napkin ~~~~ -usage: napkin.py [-h] [-v V] [-f F] [-t T] [-s] [-o O] +usage: napkin.py [-h] [-v V] [-f F] [-t T] [-s] [-o O] [-l L] [--verbatim] Extract statistical analysis of text @@ -32,7 +34,10 @@ optional arguments: -f F file to analyse -t T maximum value for the top list (default is 100) -1 is no limit -s display the overall statistics (default is False) - -o O output format (default is csv) + -o O output format (default is csv), json + -l L language used for the analysis (default is en) + --verbatim Don't use the lemmatized form, use verbatim. 
(default is the + lematized form) ~~~~ # example usage of napkin diff --git a/bin/napkin.py b/bin/napkin.py index ae1ef89..5be5c05 100644 --- a/bin/napkin.py +++ b/bin/napkin.py @@ -14,7 +14,8 @@ parser.add_argument('-f', help="file to analyse") parser.add_argument('-t', help="maximum value for the top list (default is 100) -1 is no limit", default=100) parser.add_argument('-s', help="display the overall statistics (default is False)", default=False, action='store_true') parser.add_argument('-o', help="output format (default is csv), json", default="csv") -parser.add_argument("-l", help="language used for the analysis (default is en)", default="en") +parser.add_argument('-l', help="language used for the analysis (default is en)", default="en") +parser.add_argument('--verbatim', help="Don't use the lemmatized form, use verbatim. (default is the lematized form)", default=False, action='store_true') args = parser.parse_args() if args.f is None: @@ -51,11 +52,17 @@ redisdb.hset("stats", "token", doc.__len__()) for token in doc: if token.pos_ == "VERB" and not token.is_oov: - redisdb.zincrby("verb:napkin", 1, token.lemma_) + if not args.verbatim: + redisdb.zincrby("verb:napkin", 1, token.lemma_) + else: + redisdb.zincrby("verb:napkin", 1, token.text) redisdb.hincrby("stats", "verb:napkin", 1) continue if token.pos_ == "NOUN" and not token.is_oov: - redisdb.zincrby("noun:napkin", 1, token.lemma_) + if not args.verbatim: + redisdb.zincrby("noun:napkin", 1, token.lemma_) + else: + redisdb.zincrby("noun:napkin", 1, token.text) redisdb.hincrby("stats", "noun:napkin", 1) continue