From 949e41d19f325b4e03101760fca6562e7b4cd002 Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy <a@foo.be>
Date: Thu, 8 Oct 2020 23:13:51 +0200
Subject: [PATCH] new: [lemmatized/verbatim] displaying verbatim or lemmatized
 version is now an option

---
 README.md     |  9 +++++++--
 bin/napkin.py | 13 ++++++++++---
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 0e06076..1932b2a 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,8 @@ Analysis features are :
 - Mention frequency (everything prefixed with an @ symbol)
 - Out-Of-Vocabulary (OOV) word frequency meaning any words outside English dictionary
 
+Verbs and nouns are shown in their lemmatized form by default; the `--verbatim` option keeps their original inflection instead.
+
 # requirements
 
 - Python >= 3.6
@@ -22,7 +24,7 @@ Analysis features are :
 # how to use napkin
 
 ~~~~
-usage: napkin.py [-h] [-v V] [-f F] [-t T] [-s] [-o O]
+usage: napkin.py [-h] [-v V] [-f F] [-t T] [-s] [-o O] [-l L] [--verbatim]
 
 Extract statistical analysis of text
 
@@ -32,7 +34,10 @@ optional arguments:
   -f F        file to analyse
   -t T        maximum value for the top list (default is 100) -1 is no limit
   -s          display the overall statistics (default is False)
-  -o O        output format (default is csv)
+  -o O        output format (default is csv), json
+  -l L        language used for the analysis (default is en)
+  --verbatim  Don't use the lemmatized form, use verbatim. (default is the
+              lemmatized form)
 ~~~~
 
 # example usage of napkin
diff --git a/bin/napkin.py b/bin/napkin.py
index ae1ef89..5be5c05 100644
--- a/bin/napkin.py
+++ b/bin/napkin.py
@@ -14,7 +14,8 @@ parser.add_argument('-f', help="file to analyse")
 parser.add_argument('-t', help="maximum value for the top list (default is 100) -1 is no limit", default=100)
 parser.add_argument('-s', help="display the overall statistics (default is False)", default=False,  action='store_true')
 parser.add_argument('-o', help="output format (default is csv), json", default="csv")
-parser.add_argument("-l", help="language used for the analysis (default is en)", default="en")
+parser.add_argument('-l', help="language used for the analysis (default is en)", default="en")
+parser.add_argument('--verbatim', help="Don't use the lemmatized form, use verbatim. (default is the lematized form)", default=False, action='store_true')
 
 args = parser.parse_args()
 if args.f is None:
@@ -51,11 +52,17 @@ redisdb.hset("stats", "token", doc.__len__())
 
 for token in doc:
         if token.pos_ == "VERB" and not token.is_oov:
-            redisdb.zincrby("verb:napkin", 1, token.lemma_)
+            if not args.verbatim:
+                redisdb.zincrby("verb:napkin", 1, token.lemma_)
+            else:
+                redisdb.zincrby("verb:napkin", 1, token.text)
             redisdb.hincrby("stats", "verb:napkin", 1)
             continue
         if token.pos_ == "NOUN" and not token.is_oov:
-            redisdb.zincrby("noun:napkin", 1, token.lemma_)
+            if not args.verbatim:
+                redisdb.zincrby("noun:napkin", 1, token.lemma_)
+            else:
+                redisdb.zincrby("noun:napkin", 1, token.text)
             redisdb.hincrby("stats", "noun:napkin", 1)
             continue