chg: [cli] black the napkin binary

2024-11-21 17:37:07 +00:00 · 2024-02-25 15:53:15 +01:00 · 2024-02-25 15:53:15 +01:00 · 8541ae3192
commit 8541ae3192
parent a2a074436e
1 changed files with 198 additions and 107 deletions
--- a/bin/napkin.py
+++ b/bin/napkin.py
@@ -15,21 +15,78 @@ version = "0.9"
 parser = argparse.ArgumentParser(description="Extract statistical analysis of text")
 parser.add_argument('-v', help="verbose output")
 parser.add_argument('-f', help="file to analyse")
-parser.add_argument('-t', help="maximum value for the top list (default is 100) -1 is no limit", default=100)
+parser.add_argument(
-parser.add_argument('-s', help="display the overall statistics (default is False)", default=False,  action='store_true')
+    '-t',
-parser.add_argument('-o', help="output format (default is csv), json, readable", default="csv")
+    help="maximum value for the top list (default is 100) -1 is no limit",
-parser.add_argument('-l', help="language used for the analysis (default is en)", default="en")
+    default=100,
-parser.add_argument('-i', help="Use stdin instead of a filename", default=False, action='store_true')
+)
-parser.add_argument('--verbatim', help="Don't use the lemmatized form, use verbatim. (default is the lematized form)", default=False, action='store_true')
+parser.add_argument(
-parser.add_argument('--no-flushdb', help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. (by default the redis database is flushed at each run)", default=False, action='store_true')
+    '-s',
-parser.add_argument('--binary', help="set output in binary instead of UTF-8 (default)", default=False, action='store_true')
+    help="display the overall statistics (default is False)",
-parser.add_argument('--analysis', help="Limit output to a specific analysis (verb, noun, hashtag, mention, digit, url, oov, labels, punct). (Default is all analysis are displayed)", default='all')
+    default=False,
-parser.add_argument('--disable-parser', help="disable parser component in Spacy", default=False, action='store_true')
+    action='store_true',
-parser.add_argument('--disable-tagger', help="disable tagger component in Spacy", default=False, action='store_true')
+)
-parser.add_argument('--token-span', default= None, help='Find the sentences where a specific token is located')
+parser.add_argument(
-parser.add_argument('--table-format', help="set tabulate format (default is fancy_grid)", default="fancy_grid")
+    '-o', help="output format (default is csv), json, readable", default="csv"
-parser.add_argument('--full-labels', help="store each label value in a ranked set (default is False)", action='store_true', default=False)
+)
-#parser.add_argument('--geolocation', help="export geolocation (default is False)", action='store_true', default=False)
+parser.add_argument(
    '-l', help="language used for the analysis (default is en)", default="en"
 )
 parser.add_argument(
    '-i', help="Use stdin instead of a filename", default=False, action='store_true'
 )
 parser.add_argument(
    '--verbatim',
    help="Don't use the lemmatized form, use verbatim. (default is the lematized form)",
    default=False,
    action='store_true',
 )
 parser.add_argument(
    '--no-flushdb',
    help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. (by default the redis database is flushed at each run)",
    default=False,
    action='store_true',
 )
 parser.add_argument(
    '--binary',
    help="set output in binary instead of UTF-8 (default)",
    default=False,
    action='store_true',
 )
 parser.add_argument(
    '--analysis',
    help="Limit output to a specific analysis (verb, noun, hashtag, mention, digit, url, oov, labels, punct). (Default is all analysis are displayed)",
    default='all',
 )
 parser.add_argument(
    '--disable-parser',
    help="disable parser component in Spacy",
    default=False,
    action='store_true',
 )
 parser.add_argument(
    '--disable-tagger',
    help="disable tagger component in Spacy",
    default=False,
    action='store_true',
 )
 parser.add_argument(
    '--token-span',
    default=None,
    help='Find the sentences where a specific token is located',
 )
 parser.add_argument(
    '--table-format',
    help="set tabulate format (default is fancy_grid)",
    default="fancy_grid",
 )
 parser.add_argument(
    '--full-labels',
    help="store each label value in a ranked set (default is False)",
    action='store_true',
    default=False,
 )
 # parser.add_argument('--geolocation', help="export geolocation (default is False)", action='store_true', default=False)
 args = parser.parse_args()
@@ -37,11 +94,13 @@ if args.f is None and not args.i:
    parser.print_help()
    sys.exit()
-#if args.geolocation:
+# if args.geolocation:
 #    args.full_labels = True
 if not args.binary:
-    redisdb = redis.Redis(host="localhost", port=6379, db=5, encoding='utf-8', decode_responses=True)
+    redisdb = redis.Redis(
        host="localhost", port=6379, db=5, encoding='utf-8', decode_responses=True
    )
 else:
    redisdb = redis.Redis(host="localhost", port=6379, db=5)
@@ -61,7 +120,7 @@ if args.disable_tagger:
    disable.append("tagger")
 if args.l == "fr":
-    try :
+    try:
        nlp = spacy.load("fr_core_news_md", disable=disable)
    except:
        print("Downloading missing model")
@@ -90,20 +149,52 @@ if args.i:
 detect_lang = cld3.get_language(text)
 if detect_lang[0] != args.l:
-    sys.exit("Language detected ({}) is different than the NLP used ({})".format(detect_lang[0], args.l))
+    sys.exit(
        "Language detected ({}) is different than the NLP used ({})".format(
            detect_lang[0], args.l
        )
    )
 doc = nlp(text)
-analysis = ["verb", "noun", "hashtag", "mention",
+analysis = [
-            "digit", "url", "oov", "labels",
+    "verb",
-            "punct", "email"]
+    "noun",
    "hashtag",
    "mention",
    "digit",
    "url",
    "oov",
    "labels",
    "punct",
    "email",
 ]
 if args.token_span and not disable:
    analysis.append("span")
 redisdb.hset("stats", "token", doc.__len__())
-labels = [ "EVENT", "PERCENT", "MONEY", "FAC", "TIME", "QUANTITY", "WORK_OF_ART", "LANGUAGE", "PRODUCT", "LOC", "LAW", "DATE", "ORDINAL", "NORP", "ORG", "CARDINAL", "GPE", "PERSON"]
+labels = [
    "EVENT",
    "PERCENT",
    "MONEY",
    "FAC",
    "TIME",
    "QUANTITY",
    "WORK_OF_ART",
    "LANGUAGE",
    "PRODUCT",
    "LOC",
    "LAW",
    "DATE",
    "ORDINAL",
    "NORP",
    "ORG",
    "CARDINAL",
    "GPE",
    "PERSON",
 ]
 for entity in doc.ents:
    redisdb.zincrby("labels", 1, entity.label_)
@@ -165,7 +256,7 @@ for token in doc:
 if args.o == "json":
-    output_json = {"format":"napkin", "version": version}
+    output_json = {"format": "napkin", "version": version}
 for anal in analysis:
    more_info = ""
    if args.analysis == "all" or args.analysis == anal:
@@ -183,10 +274,10 @@ for anal in analysis:
        header = ["\033[1mTop {} of {} {}\033[0m".format(args.t, anal, more_info)]
        readable_table = []
    elif args.o == "json":
-            output_json.update({anal:[]})
+        output_json.update({anal: []})
    for a in x:
        if args.o == "csv":
-                print("{},{},{}".format(anal,a[0],a[1]))
+            print("{},{},{}".format(anal, a[0], a[1]))
        elif args.o == "readable":
            if previous_value == a[1]:
                readable_table.append(["{}".format(a[0])])