From 8541ae31925fadf3ed09a35f0f339846bf6a7490 Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy
Date: Sun, 25 Feb 2024 15:53:15 +0100
Subject: [PATCH] chg: [cli] black the napkin binary

---
 bin/napkin.py | 305 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 198 insertions(+), 107 deletions(-)

diff --git a/bin/napkin.py b/bin/napkin.py
index 008e4a9..86b9e94 100644
--- a/bin/napkin.py
+++ b/bin/napkin.py
@@ -15,21 +15,78 @@ version = "0.9"
 parser = argparse.ArgumentParser(description="Extract statistical analysis of text")
 parser.add_argument('-v', help="verbose output")
 parser.add_argument('-f', help="file to analyse")
-parser.add_argument('-t', help="maximum value for the top list (default is 100) -1 is no limit", default=100)
-parser.add_argument('-s', help="display the overall statistics (default is False)", default=False, action='store_true')
-parser.add_argument('-o', help="output format (default is csv), json, readable", default="csv")
-parser.add_argument('-l', help="language used for the analysis (default is en)", default="en")
-parser.add_argument('-i', help="Use stdin instead of a filename", default=False, action='store_true')
-parser.add_argument('--verbatim', help="Don't use the lemmatized form, use verbatim. (default is the lematized form)", default=False, action='store_true')
-parser.add_argument('--no-flushdb', help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. (by default the redis database is flushed at each run)", default=False, action='store_true')
-parser.add_argument('--binary', help="set output in binary instead of UTF-8 (default)", default=False, action='store_true')
-parser.add_argument('--analysis', help="Limit output to a specific analysis (verb, noun, hashtag, mention, digit, url, oov, labels, punct). (Default is all analysis are displayed)", default='all')
-parser.add_argument('--disable-parser', help="disable parser component in Spacy", default=False, action='store_true')
-parser.add_argument('--disable-tagger', help="disable tagger component in Spacy", default=False, action='store_true')
-parser.add_argument('--token-span', default= None, help='Find the sentences where a specific token is located')
-parser.add_argument('--table-format', help="set tabulate format (default is fancy_grid)", default="fancy_grid")
-parser.add_argument('--full-labels', help="store each label value in a ranked set (default is False)", action='store_true', default=False)
-#parser.add_argument('--geolocation', help="export geolocation (default is False)", action='store_true', default=False)
+parser.add_argument(
+    '-t',
+    help="maximum value for the top list (default is 100) -1 is no limit",
+    default=100,
+)
+parser.add_argument(
+    '-s',
+    help="display the overall statistics (default is False)",
+    default=False,
+    action='store_true',
+)
+parser.add_argument(
+    '-o', help="output format (default is csv), json, readable", default="csv"
+)
+parser.add_argument(
+    '-l', help="language used for the analysis (default is en)", default="en"
+)
+parser.add_argument(
+    '-i', help="Use stdin instead of a filename", default=False, action='store_true'
+)
+parser.add_argument(
+    '--verbatim',
+    help="Don't use the lemmatized form, use verbatim. (default is the lematized form)",
+    default=False,
+    action='store_true',
+)
+parser.add_argument(
+    '--no-flushdb',
+    help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. (by default the redis database is flushed at each run)",
+    default=False,
+    action='store_true',
+)
+parser.add_argument(
+    '--binary',
+    help="set output in binary instead of UTF-8 (default)",
+    default=False,
+    action='store_true',
+)
+parser.add_argument(
+    '--analysis',
+    help="Limit output to a specific analysis (verb, noun, hashtag, mention, digit, url, oov, labels, punct). (Default is all analysis are displayed)",
+    default='all',
+)
+parser.add_argument(
+    '--disable-parser',
+    help="disable parser component in Spacy",
+    default=False,
+    action='store_true',
+)
+parser.add_argument(
+    '--disable-tagger',
+    help="disable tagger component in Spacy",
+    default=False,
+    action='store_true',
+)
+parser.add_argument(
+    '--token-span',
+    default=None,
+    help='Find the sentences where a specific token is located',
+)
+parser.add_argument(
+    '--table-format',
+    help="set tabulate format (default is fancy_grid)",
+    default="fancy_grid",
+)
+parser.add_argument(
+    '--full-labels',
+    help="store each label value in a ranked set (default is False)",
+    action='store_true',
+    default=False,
+)
+# parser.add_argument('--geolocation', help="export geolocation (default is False)", action='store_true', default=False)
 
 args = parser.parse_args()
 
@@ -37,11 +94,13 @@ if args.f is None and not args.i:
     parser.print_help()
     sys.exit()
 
-#if args.geolocation:
+# if args.geolocation:
 #    args.full_labels = True
 
 if not args.binary:
-    redisdb = redis.Redis(host="localhost", port=6379, db=5, encoding='utf-8', decode_responses=True)
+    redisdb = redis.Redis(
+        host="localhost", port=6379, db=5, encoding='utf-8', decode_responses=True
+    )
 else:
     redisdb = redis.Redis(host="localhost", port=6379, db=5)
 
@@ -61,7 +120,7 @@ if args.disable_tagger:
     disable.append("tagger")
 
 if args.l == "fr":
-    try :
+    try:
         nlp = spacy.load("fr_core_news_md", disable=disable)
     except:
         print("Downloading missing model")
@@ -90,116 +149,148 @@ if args.i:
 
     detect_lang = cld3.get_language(text)
     if detect_lang[0] != args.l:
-        sys.exit("Language detected ({}) is different than the NLP used ({})".format(detect_lang[0], args.l))
+        sys.exit(
+            "Language detected ({}) is different than the NLP used ({})".format(
+                detect_lang[0], args.l
+            )
+        )
 
 doc = nlp(text)
 
-analysis = ["verb", "noun", "hashtag", "mention",
-            "digit", "url", "oov", "labels",
-            "punct", "email"]
+analysis = [
+    "verb",
+    "noun",
+    "hashtag",
+    "mention",
+    "digit",
+    "url",
+    "oov",
+    "labels",
+    "punct",
+    "email",
+]
 
 if args.token_span and not disable:
     analysis.append("span")
 
 redisdb.hset("stats", "token", doc.__len__())
 
-labels = [ "EVENT", "PERCENT", "MONEY", "FAC", "TIME", "QUANTITY", "WORK_OF_ART", "LANGUAGE", "PRODUCT", "LOC", "LAW", "DATE", "ORDINAL", "NORP", "ORG", "CARDINAL", "GPE", "PERSON"]
+labels = [
+    "EVENT",
+    "PERCENT",
+    "MONEY",
+    "FAC",
+    "TIME",
+    "QUANTITY",
+    "WORK_OF_ART",
+    "LANGUAGE",
+    "PRODUCT",
+    "LOC",
+    "LAW",
+    "DATE",
+    "ORDINAL",
+    "NORP",
+    "ORG",
+    "CARDINAL",
+    "GPE",
+    "PERSON",
+]
 
 for entity in doc.ents:
-        redisdb.zincrby("labels", 1, entity.label_)
-        if not args.full_labels:
-            continue
-        if entity.label_ in labels:
-            redisdb.zincrby("label:{}".format(entity.label_), 1, entity.text)
+    redisdb.zincrby("labels", 1, entity.label_)
+    if not args.full_labels:
+        continue
+    if entity.label_ in labels:
+        redisdb.zincrby("label:{}".format(entity.label_), 1, entity.text)
 
 for token in doc:
-        if args.token_span is not None and not disable:
-            if token.text == args.token_span:
-                redisdb.zincrby("span", 1, token.sent.as_doc().text)
-        if token.pos_ == "VERB" and not token.is_oov and len(token) > 1:
-            if not args.verbatim:
-                redisdb.zincrby("verb", 1, token.lemma_)
-            else:
-                redisdb.zincrby("verb", 1, token.text)
-            redisdb.hincrby("stats", "verb", 1)
-            continue
-        if token.pos_ == "NOUN" and not token.is_oov and len(token) > 1:
-            if not args.verbatim:
-                redisdb.zincrby("noun", 1, token.lemma_)
-            else:
-                redisdb.zincrby("noun", 1, token.text)
-            redisdb.hincrby("stats", "noun", 1)
-            continue
-        if token.pos_ == "PUNCT" and not token.is_oov:
-            redisdb.zincrby("punct", 1, "{}".format(token))
-            redisdb.hincrby("stats", "punct", 1)
-            continue
+    if args.token_span is not None and not disable:
+        if token.text == args.token_span:
+            redisdb.zincrby("span", 1, token.sent.as_doc().text)
+    if token.pos_ == "VERB" and not token.is_oov and len(token) > 1:
+        if not args.verbatim:
+            redisdb.zincrby("verb", 1, token.lemma_)
+        else:
+            redisdb.zincrby("verb", 1, token.text)
+        redisdb.hincrby("stats", "verb", 1)
+        continue
+    if token.pos_ == "NOUN" and not token.is_oov and len(token) > 1:
+        if not args.verbatim:
+            redisdb.zincrby("noun", 1, token.lemma_)
+        else:
+            redisdb.zincrby("noun", 1, token.text)
+        redisdb.hincrby("stats", "noun", 1)
+        continue
+    if token.pos_ == "PUNCT" and not token.is_oov:
+        redisdb.zincrby("punct", 1, "{}".format(token))
+        redisdb.hincrby("stats", "punct", 1)
+        continue
 
-        if token.is_oov:
-            value = "{}".format(token)
-            if value.startswith('#'):
-                redisdb.zincrby("hashtag", 1, value[1:])
-                redisdb.hincrby("stats", "hashtag", 1)
-                continue
-            if value.startswith('@'):
-                redisdb.zincrby("mention", 1, value[1:])
-                redisdb.hincrby("stats", "mention", 1)
-                continue
-            if token.is_digit:
-                redisdb.zincrby("digit", 1, value)
-                redisdb.hincrby("stats", "digit", 1)
-                continue
-            if token.is_space:
-                redisdb.hincrby("stats", "space", 1)
-                continue
-            if token.like_url:
-                redisdb.zincrby("url", 1, value)
-                redisdb.hincrby("stats", "url", 1)
-                continue
-            if token.like_email:
-                redisdb.zincrby("email", 1, value)
-                redisdb.hincrby("stats", "email", 1)
-                continue
-            redisdb.zincrby("oov", 1, value)
-            redisdb.hincrby("stats", "oov", 1)
+    if token.is_oov:
+        value = "{}".format(token)
+        if value.startswith('#'):
+            redisdb.zincrby("hashtag", 1, value[1:])
+            redisdb.hincrby("stats", "hashtag", 1)
+            continue
+        if value.startswith('@'):
+            redisdb.zincrby("mention", 1, value[1:])
+            redisdb.hincrby("stats", "mention", 1)
+            continue
+        if token.is_digit:
+            redisdb.zincrby("digit", 1, value)
+            redisdb.hincrby("stats", "digit", 1)
+            continue
+        if token.is_space:
+            redisdb.hincrby("stats", "space", 1)
+            continue
+        if token.like_url:
+            redisdb.zincrby("url", 1, value)
+            redisdb.hincrby("stats", "url", 1)
+            continue
+        if token.like_email:
+            redisdb.zincrby("email", 1, value)
+            redisdb.hincrby("stats", "email", 1)
+            continue
+        redisdb.zincrby("oov", 1, value)
+        redisdb.hincrby("stats", "oov", 1)
 
 if args.o == "json":
-    output_json = {"format":"napkin", "version": version}
+    output_json = {"format": "napkin", "version": version}
 
 for anal in analysis:
-        more_info = ""
-        if args.analysis == "all" or args.analysis == anal:
-            pass
-        else:
-            continue
-        if anal == "span":
-            more_info = "for {}".format(args.token_span)
-        if args.o == "readable":
-            previous_value = None
-        x = redisdb.zrevrange(anal, 0, args.t, withscores=True, score_cast_func=int)
+    more_info = ""
+    if args.analysis == "all" or args.analysis == anal:
+        pass
+    else:
+        continue
+    if anal == "span":
+        more_info = "for {}".format(args.token_span)
+    if args.o == "readable":
+        previous_value = None
+    x = redisdb.zrevrange(anal, 0, args.t, withscores=True, score_cast_func=int)
+    if args.o == "csv":
+        print()
+    elif args.o == "readable":
+        header = ["\033[1mTop {} of {} {}\033[0m".format(args.t, anal, more_info)]
+        readable_table = []
+    elif args.o == "json":
+        output_json.update({anal: []})
+    for a in x:
         if args.o == "csv":
-            print()
+            print("{},{},{}".format(anal, a[0], a[1]))
         elif args.o == "readable":
-            header = ["\033[1mTop {} of {} {}\033[0m".format(args.t, anal, more_info)]
-            readable_table = []
+            if previous_value == a[1]:
+                readable_table.append(["{}".format(a[0])])
+            elif previous_value is None or a[1] < previous_value:
+                previous_value = a[1]
+                readable_table.append(["{} occurences".format(a[1])])
+                readable_table.append(["{}".format(a[0])])
        elif args.o == "json":
-            output_json.update({anal:[]})
-        for a in x:
-            if args.o == "csv":
-                print("{},{},{}".format(anal,a[0],a[1]))
-            elif args.o == "readable":
-                if previous_value == a[1]:
-                    readable_table.append(["{}".format(a[0])])
-                elif previous_value is None or a[1] < previous_value:
-                    previous_value = a[1]
-                    readable_table.append(["{} occurences".format(a[1])])
-                    readable_table.append(["{}".format(a[0])])
-            elif args.o == "json":
-                output_json[anal].append(a)
-        if args.o == "readable":
-            print(tabulate(readable_table, header, tablefmt=args.table_format))
-        if args.o == "csv":
-            print("#")
+            output_json[anal].append(a)
+    if args.o == "readable":
+        print(tabulate(readable_table, header, tablefmt=args.table_format))
+    if args.o == "csv":
+        print("#")
 
 if args.s:
     print(redisdb.hgetall('stats'))
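
Reviewer note, not part of the patch: a black pass is expected to change layout only, never behaviour. A minimal sketch for checking that locally, assuming the pre- and post-patch versions of bin/napkin.py have been saved as napkin_before.py and napkin_after.py (hypothetical filenames; running `black --check bin/napkin.py` gives the same assurance from black itself):

    # verify_reformat.py - confirm the commit is formatting-only.
    # ast.parse discards whitespace and comments, so two sources that
    # produce identical AST dumps compile to the same program.
    import ast

    with open("napkin_before.py") as f:
        before = ast.dump(ast.parse(f.read()))
    with open("napkin_after.py") as f:
        after = ast.dump(ast.parse(f.read()))

    # Identical dumps mean the reformat only moved text around.
    print("formatting-only:", before == after)

This catches the re-indentation of the entity/token loops and the split argparse calls as no-ops, which is the same AST-equivalence safety check black performs internally before writing a file.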