chg: [cli] black the napkin binary

This commit is contained in:
Alexandre Dulaunoy 2024-02-25 15:53:15 +01:00
parent a2a074436e
commit 8541ae3192
Signed by: adulau
GPG key ID: 09E2CD4944E6CBCD

View file

@@ -15,21 +15,78 @@ version = "0.9"
 parser = argparse.ArgumentParser(description="Extract statistical analysis of text")
 parser.add_argument('-v', help="verbose output")
 parser.add_argument('-f', help="file to analyse")
-parser.add_argument('-t', help="maximum value for the top list (default is 100) -1 is no limit", default=100)
-parser.add_argument('-s', help="display the overall statistics (default is False)", default=False, action='store_true')
-parser.add_argument('-o', help="output format (default is csv), json, readable", default="csv")
-parser.add_argument('-l', help="language used for the analysis (default is en)", default="en")
-parser.add_argument('-i', help="Use stdin instead of a filename", default=False, action='store_true')
-parser.add_argument('--verbatim', help="Don't use the lemmatized form, use verbatim. (default is the lematized form)", default=False, action='store_true')
-parser.add_argument('--no-flushdb', help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. (by default the redis database is flushed at each run)", default=False, action='store_true')
-parser.add_argument('--binary', help="set output in binary instead of UTF-8 (default)", default=False, action='store_true')
-parser.add_argument('--analysis', help="Limit output to a specific analysis (verb, noun, hashtag, mention, digit, url, oov, labels, punct). (Default is all analysis are displayed)", default='all')
-parser.add_argument('--disable-parser', help="disable parser component in Spacy", default=False, action='store_true')
-parser.add_argument('--disable-tagger', help="disable tagger component in Spacy", default=False, action='store_true')
-parser.add_argument('--token-span', default= None, help='Find the sentences where a specific token is located')
-parser.add_argument('--table-format', help="set tabulate format (default is fancy_grid)", default="fancy_grid")
-parser.add_argument('--full-labels', help="store each label value in a ranked set (default is False)", action='store_true', default=False)
-#parser.add_argument('--geolocation', help="export geolocation (default is False)", action='store_true', default=False)
+parser.add_argument(
+    '-t',
+    help="maximum value for the top list (default is 100) -1 is no limit",
+    default=100,
+)
+parser.add_argument(
+    '-s',
+    help="display the overall statistics (default is False)",
+    default=False,
+    action='store_true',
+)
+parser.add_argument(
+    '-o', help="output format (default is csv), json, readable", default="csv"
+)
+parser.add_argument(
+    '-l', help="language used for the analysis (default is en)", default="en"
+)
+parser.add_argument(
+    '-i', help="Use stdin instead of a filename", default=False, action='store_true'
+)
+parser.add_argument(
+    '--verbatim',
+    help="Don't use the lemmatized form, use verbatim. (default is the lematized form)",
+    default=False,
+    action='store_true',
+)
+parser.add_argument(
+    '--no-flushdb',
+    help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. (by default the redis database is flushed at each run)",
+    default=False,
+    action='store_true',
+)
+parser.add_argument(
+    '--binary',
+    help="set output in binary instead of UTF-8 (default)",
+    default=False,
+    action='store_true',
+)
+parser.add_argument(
+    '--analysis',
+    help="Limit output to a specific analysis (verb, noun, hashtag, mention, digit, url, oov, labels, punct). (Default is all analysis are displayed)",
+    default='all',
+)
+parser.add_argument(
+    '--disable-parser',
+    help="disable parser component in Spacy",
+    default=False,
+    action='store_true',
+)
+parser.add_argument(
+    '--disable-tagger',
+    help="disable tagger component in Spacy",
+    default=False,
+    action='store_true',
+)
+parser.add_argument(
+    '--token-span',
+    default=None,
+    help='Find the sentences where a specific token is located',
+)
+parser.add_argument(
+    '--table-format',
+    help="set tabulate format (default is fancy_grid)",
+    default="fancy_grid",
+)
+parser.add_argument(
+    '--full-labels',
+    help="store each label value in a ranked set (default is False)",
+    action='store_true',
+    default=False,
+)
+# parser.add_argument('--geolocation', help="export geolocation (default is False)", action='store_true', default=False)
 args = parser.parse_args()
@@ -37,11 +94,13 @@ if args.f is None and not args.i:
     parser.print_help()
     sys.exit()
-#if args.geolocation:
+# if args.geolocation:
 #    args.full_labels = True
 if not args.binary:
-    redisdb = redis.Redis(host="localhost", port=6379, db=5, encoding='utf-8', decode_responses=True)
+    redisdb = redis.Redis(
+        host="localhost", port=6379, db=5, encoding='utf-8', decode_responses=True
+    )
 else:
     redisdb = redis.Redis(host="localhost", port=6379, db=5)
@@ -61,7 +120,7 @@ if args.disable_tagger:
     disable.append("tagger")
 if args.l == "fr":
-    try :
+    try:
         nlp = spacy.load("fr_core_news_md", disable=disable)
     except:
         print("Downloading missing model")
@@ -90,20 +149,52 @@ if args.i:
 detect_lang = cld3.get_language(text)
 if detect_lang[0] != args.l:
-    sys.exit("Language detected ({}) is different than the NLP used ({})".format(detect_lang[0], args.l))
+    sys.exit(
+        "Language detected ({}) is different than the NLP used ({})".format(
+            detect_lang[0], args.l
+        )
+    )
 doc = nlp(text)
-analysis = ["verb", "noun", "hashtag", "mention",
-            "digit", "url", "oov", "labels",
-            "punct", "email"]
+analysis = [
+    "verb",
+    "noun",
+    "hashtag",
+    "mention",
+    "digit",
+    "url",
+    "oov",
+    "labels",
+    "punct",
+    "email",
+]
 if args.token_span and not disable:
     analysis.append("span")
 redisdb.hset("stats", "token", doc.__len__())
-labels = [ "EVENT", "PERCENT", "MONEY", "FAC", "TIME", "QUANTITY", "WORK_OF_ART", "LANGUAGE", "PRODUCT", "LOC", "LAW", "DATE", "ORDINAL", "NORP", "ORG", "CARDINAL", "GPE", "PERSON"]
+labels = [
+    "EVENT",
+    "PERCENT",
+    "MONEY",
+    "FAC",
+    "TIME",
+    "QUANTITY",
+    "WORK_OF_ART",
+    "LANGUAGE",
+    "PRODUCT",
+    "LOC",
+    "LAW",
+    "DATE",
+    "ORDINAL",
+    "NORP",
+    "ORG",
+    "CARDINAL",
+    "GPE",
+    "PERSON",
+]
 for entity in doc.ents:
     redisdb.zincrby("labels", 1, entity.label_)
@@ -165,7 +256,7 @@ for token in doc:
 if args.o == "json":
-    output_json = {"format":"napkin", "version": version}
+    output_json = {"format": "napkin", "version": version}
 for anal in analysis:
     more_info = ""
     if args.analysis == "all" or args.analysis == anal:
@@ -183,10 +274,10 @@ for anal in analysis:
         header = ["\033[1mTop {} of {} {}\033[0m".format(args.t, anal, more_info)]
         readable_table = []
     elif args.o == "readable":
-        output_json.update({anal:[]})
+        output_json.update({anal: []})
     for a in x:
         if args.o == "csv":
-            print("{},{},{}".format(anal,a[0],a[1]))
+            print("{},{},{}".format(anal, a[0], a[1]))
         elif args.o == "readable":
             if previous_value == a[1]:
                 readable_table.append(["{}".format(a[0])])