new: [option] --analysis to limit the output to a specific analysis

2024-11-22 01:47:06 +00:00 · 2020-10-09 23:23:36 +02:00 · 2020-10-09 23:23:36 +02:00 · 24e69a8ad9
commit 24e69a8ad9
parent 32a899a4a0
2 changed files with 24 additions and 15 deletions
--- a/README.md
+++ b/README.md
@@ -31,24 +31,28 @@ Intermediate results are stored in a Redis database to allow the analysis of mul

 ~~~~
 usage: napkin.py [-h] [-v V] [-f F] [-t T] [-s] [-o O] [-l L] [--verbatim]
-                 [--no-flushdb] [--binary]
+                 [--no-flushdb] [--binary] [--analysis ANALYSIS]

 Extract statistical analysis of text

 optional arguments:
-  -h, --help    show this help message and exit
-  -v V          verbose output
-  -f F          file to analyse
-  -t T          maximum value for the top list (default is 100) -1 is no limit
-  -s            display the overall statistics (default is False)
-  -o O          output format (default is csv), json, readable
-  -l L          language used for the analysis (default is en)
-  --verbatim    Don't use the lemmatized form, use verbatim. (default is the
-                lematized form)
-  --no-flushdb  Don't flush the redisdb, useful when you want to process
-                multiple files and aggregate the results. (by default the
-                redis database is flushed at each run)
-  --binary      Output response in binary instead of UTF-8 (default)
+  -h, --help           show this help message and exit
+  -v V                 verbose output
+  -f F                 file to analyse
+  -t T                 maximum value for the top list (default is 100) -1 is
+                       no limit
+  -s                   display the overall statistics (default is False)
+  -o O                 output format (default is csv), json, readable
+  -l L                 language used for the analysis (default is en)
+  --verbatim           Don't use the lemmatized form, use verbatim. (default
+                       is the lematized form)
+  --no-flushdb         Don't flush the redisdb, useful when you want to
+                       process multiple files and aggregate the results. (by
+                       default the redis database is flushed at each run)
+  --binary             set output in binary instead of UTF-8 (default)
+  --analysis ANALYSIS  Limit output to a specific analysis (verb, noun,
+                       hashtag, mention, digit, url, oov, labels, punct).
+                       (Default is all analysis are displayed)
 ~~~~

 # example usage of napkin
--- a/bin/napkin.py
+++ b/bin/napkin.py
@@ -18,7 +18,8 @@ parser.add_argument('-o', help="output format (default is csv), json, readable",
 parser.add_argument('-l', help="language used for the analysis (default is en)", default="en")
 parser.add_argument('--verbatim', help="Don't use the lemmatized form, use verbatim. (default is the lematized form)", default=False, action='store_true')
 parser.add_argument('--no-flushdb', help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. (by default the redis database is flushed at each run)", default=False, action='store_true')
-parser.add_argument('--binary', help="Output response in binary instead of UTF-8 (default)", default=False, action='store_true')
+parser.add_argument('--binary', help="set output in binary instead of UTF-8 (default)", default=False, action='store_true')
+parser.add_argument('--analysis', help="Limit output to a specific analysis (verb, noun, hashtag, mention, digit, url, oov, labels, punct). (Default is all analysis are displayed)", default='all')

 args = parser.parse_args()
 if args.f is None:
@@ -118,6 +119,10 @@ for entity in doc.ents:
 if args.o == "json":
    output_json = {"format":"napkin"}
 for anal in analysis:
+        if args.analysis == "all" or args.analysis == anal:
+            pass
+        else:
+            continue
        if args.o == "readable":
            previous_value = None
        x = redisdb.zrevrange(anal, 1, args.t, withscores=True, score_cast_func=int)