mirror of
https://github.com/adulau/napkin-text-analysis.git
synced 2024-11-21 17:37:07 +00:00
chg: [cli] black the napkin binary
This commit is contained in:
parent
a2a074436e
commit
8541ae3192
1 changed files with 198 additions and 107 deletions
305
bin/napkin.py
305
bin/napkin.py
|
@ -15,21 +15,78 @@ version = "0.9"
|
||||||
parser = argparse.ArgumentParser(description="Extract statistical analysis of text")
|
parser = argparse.ArgumentParser(description="Extract statistical analysis of text")
|
||||||
parser.add_argument('-v', help="verbose output")
|
parser.add_argument('-v', help="verbose output")
|
||||||
parser.add_argument('-f', help="file to analyse")
|
parser.add_argument('-f', help="file to analyse")
|
||||||
parser.add_argument('-t', help="maximum value for the top list (default is 100) -1 is no limit", default=100)
|
parser.add_argument(
|
||||||
parser.add_argument('-s', help="display the overall statistics (default is False)", default=False, action='store_true')
|
'-t',
|
||||||
parser.add_argument('-o', help="output format (default is csv), json, readable", default="csv")
|
help="maximum value for the top list (default is 100) -1 is no limit",
|
||||||
parser.add_argument('-l', help="language used for the analysis (default is en)", default="en")
|
default=100,
|
||||||
parser.add_argument('-i', help="Use stdin instead of a filename", default=False, action='store_true')
|
)
|
||||||
parser.add_argument('--verbatim', help="Don't use the lemmatized form, use verbatim. (default is the lematized form)", default=False, action='store_true')
|
parser.add_argument(
|
||||||
parser.add_argument('--no-flushdb', help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. (by default the redis database is flushed at each run)", default=False, action='store_true')
|
'-s',
|
||||||
parser.add_argument('--binary', help="set output in binary instead of UTF-8 (default)", default=False, action='store_true')
|
help="display the overall statistics (default is False)",
|
||||||
parser.add_argument('--analysis', help="Limit output to a specific analysis (verb, noun, hashtag, mention, digit, url, oov, labels, punct). (Default is all analysis are displayed)", default='all')
|
default=False,
|
||||||
parser.add_argument('--disable-parser', help="disable parser component in Spacy", default=False, action='store_true')
|
action='store_true',
|
||||||
parser.add_argument('--disable-tagger', help="disable tagger component in Spacy", default=False, action='store_true')
|
)
|
||||||
parser.add_argument('--token-span', default= None, help='Find the sentences where a specific token is located')
|
parser.add_argument(
|
||||||
parser.add_argument('--table-format', help="set tabulate format (default is fancy_grid)", default="fancy_grid")
|
'-o', help="output format (default is csv), json, readable", default="csv"
|
||||||
parser.add_argument('--full-labels', help="store each label value in a ranked set (default is False)", action='store_true', default=False)
|
)
|
||||||
#parser.add_argument('--geolocation', help="export geolocation (default is False)", action='store_true', default=False)
|
parser.add_argument(
|
||||||
|
'-l', help="language used for the analysis (default is en)", default="en"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'-i', help="Use stdin instead of a filename", default=False, action='store_true'
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--verbatim',
|
||||||
|
help="Don't use the lemmatized form, use verbatim. (default is the lematized form)",
|
||||||
|
default=False,
|
||||||
|
action='store_true',
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--no-flushdb',
|
||||||
|
help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. (by default the redis database is flushed at each run)",
|
||||||
|
default=False,
|
||||||
|
action='store_true',
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--binary',
|
||||||
|
help="set output in binary instead of UTF-8 (default)",
|
||||||
|
default=False,
|
||||||
|
action='store_true',
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--analysis',
|
||||||
|
help="Limit output to a specific analysis (verb, noun, hashtag, mention, digit, url, oov, labels, punct). (Default is all analysis are displayed)",
|
||||||
|
default='all',
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--disable-parser',
|
||||||
|
help="disable parser component in Spacy",
|
||||||
|
default=False,
|
||||||
|
action='store_true',
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--disable-tagger',
|
||||||
|
help="disable tagger component in Spacy",
|
||||||
|
default=False,
|
||||||
|
action='store_true',
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--token-span',
|
||||||
|
default=None,
|
||||||
|
help='Find the sentences where a specific token is located',
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--table-format',
|
||||||
|
help="set tabulate format (default is fancy_grid)",
|
||||||
|
default="fancy_grid",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--full-labels',
|
||||||
|
help="store each label value in a ranked set (default is False)",
|
||||||
|
action='store_true',
|
||||||
|
default=False,
|
||||||
|
)
|
||||||
|
# parser.add_argument('--geolocation', help="export geolocation (default is False)", action='store_true', default=False)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
@ -37,11 +94,13 @@ if args.f is None and not args.i:
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
#if args.geolocation:
|
# if args.geolocation:
|
||||||
# args.full_labels = True
|
# args.full_labels = True
|
||||||
|
|
||||||
if not args.binary:
|
if not args.binary:
|
||||||
redisdb = redis.Redis(host="localhost", port=6379, db=5, encoding='utf-8', decode_responses=True)
|
redisdb = redis.Redis(
|
||||||
|
host="localhost", port=6379, db=5, encoding='utf-8', decode_responses=True
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
redisdb = redis.Redis(host="localhost", port=6379, db=5)
|
redisdb = redis.Redis(host="localhost", port=6379, db=5)
|
||||||
|
|
||||||
|
@ -61,7 +120,7 @@ if args.disable_tagger:
|
||||||
disable.append("tagger")
|
disable.append("tagger")
|
||||||
|
|
||||||
if args.l == "fr":
|
if args.l == "fr":
|
||||||
try :
|
try:
|
||||||
nlp = spacy.load("fr_core_news_md", disable=disable)
|
nlp = spacy.load("fr_core_news_md", disable=disable)
|
||||||
except:
|
except:
|
||||||
print("Downloading missing model")
|
print("Downloading missing model")
|
||||||
|
@ -90,116 +149,148 @@ if args.i:
|
||||||
|
|
||||||
detect_lang = cld3.get_language(text)
|
detect_lang = cld3.get_language(text)
|
||||||
if detect_lang[0] != args.l:
|
if detect_lang[0] != args.l:
|
||||||
sys.exit("Language detected ({}) is different than the NLP used ({})".format(detect_lang[0], args.l))
|
sys.exit(
|
||||||
|
"Language detected ({}) is different than the NLP used ({})".format(
|
||||||
|
detect_lang[0], args.l
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
|
|
||||||
analysis = ["verb", "noun", "hashtag", "mention",
|
analysis = [
|
||||||
"digit", "url", "oov", "labels",
|
"verb",
|
||||||
"punct", "email"]
|
"noun",
|
||||||
|
"hashtag",
|
||||||
|
"mention",
|
||||||
|
"digit",
|
||||||
|
"url",
|
||||||
|
"oov",
|
||||||
|
"labels",
|
||||||
|
"punct",
|
||||||
|
"email",
|
||||||
|
]
|
||||||
|
|
||||||
if args.token_span and not disable:
|
if args.token_span and not disable:
|
||||||
analysis.append("span")
|
analysis.append("span")
|
||||||
|
|
||||||
redisdb.hset("stats", "token", doc.__len__())
|
redisdb.hset("stats", "token", doc.__len__())
|
||||||
|
|
||||||
labels = [ "EVENT", "PERCENT", "MONEY", "FAC", "TIME", "QUANTITY", "WORK_OF_ART", "LANGUAGE", "PRODUCT", "LOC", "LAW", "DATE", "ORDINAL", "NORP", "ORG", "CARDINAL", "GPE", "PERSON"]
|
labels = [
|
||||||
|
"EVENT",
|
||||||
|
"PERCENT",
|
||||||
|
"MONEY",
|
||||||
|
"FAC",
|
||||||
|
"TIME",
|
||||||
|
"QUANTITY",
|
||||||
|
"WORK_OF_ART",
|
||||||
|
"LANGUAGE",
|
||||||
|
"PRODUCT",
|
||||||
|
"LOC",
|
||||||
|
"LAW",
|
||||||
|
"DATE",
|
||||||
|
"ORDINAL",
|
||||||
|
"NORP",
|
||||||
|
"ORG",
|
||||||
|
"CARDINAL",
|
||||||
|
"GPE",
|
||||||
|
"PERSON",
|
||||||
|
]
|
||||||
|
|
||||||
for entity in doc.ents:
|
for entity in doc.ents:
|
||||||
redisdb.zincrby("labels", 1, entity.label_)
|
redisdb.zincrby("labels", 1, entity.label_)
|
||||||
if not args.full_labels:
|
if not args.full_labels:
|
||||||
continue
|
continue
|
||||||
if entity.label_ in labels:
|
if entity.label_ in labels:
|
||||||
redisdb.zincrby("label:{}".format(entity.label_), 1, entity.text)
|
redisdb.zincrby("label:{}".format(entity.label_), 1, entity.text)
|
||||||
|
|
||||||
for token in doc:
|
for token in doc:
|
||||||
if args.token_span is not None and not disable:
|
if args.token_span is not None and not disable:
|
||||||
if token.text == args.token_span:
|
if token.text == args.token_span:
|
||||||
redisdb.zincrby("span", 1, token.sent.as_doc().text)
|
redisdb.zincrby("span", 1, token.sent.as_doc().text)
|
||||||
if token.pos_ == "VERB" and not token.is_oov and len(token) > 1:
|
if token.pos_ == "VERB" and not token.is_oov and len(token) > 1:
|
||||||
if not args.verbatim:
|
if not args.verbatim:
|
||||||
redisdb.zincrby("verb", 1, token.lemma_)
|
redisdb.zincrby("verb", 1, token.lemma_)
|
||||||
else:
|
else:
|
||||||
redisdb.zincrby("verb", 1, token.text)
|
redisdb.zincrby("verb", 1, token.text)
|
||||||
redisdb.hincrby("stats", "verb", 1)
|
redisdb.hincrby("stats", "verb", 1)
|
||||||
continue
|
continue
|
||||||
if token.pos_ == "NOUN" and not token.is_oov and len(token) > 1:
|
if token.pos_ == "NOUN" and not token.is_oov and len(token) > 1:
|
||||||
if not args.verbatim:
|
if not args.verbatim:
|
||||||
redisdb.zincrby("noun", 1, token.lemma_)
|
redisdb.zincrby("noun", 1, token.lemma_)
|
||||||
else:
|
else:
|
||||||
redisdb.zincrby("noun", 1, token.text)
|
redisdb.zincrby("noun", 1, token.text)
|
||||||
redisdb.hincrby("stats", "noun", 1)
|
redisdb.hincrby("stats", "noun", 1)
|
||||||
continue
|
continue
|
||||||
if token.pos_ == "PUNCT" and not token.is_oov:
|
if token.pos_ == "PUNCT" and not token.is_oov:
|
||||||
redisdb.zincrby("punct", 1, "{}".format(token))
|
redisdb.zincrby("punct", 1, "{}".format(token))
|
||||||
redisdb.hincrby("stats", "punct", 1)
|
redisdb.hincrby("stats", "punct", 1)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if token.is_oov:
|
if token.is_oov:
|
||||||
value = "{}".format(token)
|
value = "{}".format(token)
|
||||||
if value.startswith('#'):
|
if value.startswith('#'):
|
||||||
redisdb.zincrby("hashtag", 1, value[1:])
|
redisdb.zincrby("hashtag", 1, value[1:])
|
||||||
redisdb.hincrby("stats", "hashtag", 1)
|
redisdb.hincrby("stats", "hashtag", 1)
|
||||||
continue
|
continue
|
||||||
if value.startswith('@'):
|
if value.startswith('@'):
|
||||||
redisdb.zincrby("mention", 1, value[1:])
|
redisdb.zincrby("mention", 1, value[1:])
|
||||||
redisdb.hincrby("stats", "mention", 1)
|
redisdb.hincrby("stats", "mention", 1)
|
||||||
continue
|
continue
|
||||||
if token.is_digit:
|
if token.is_digit:
|
||||||
redisdb.zincrby("digit", 1, value)
|
redisdb.zincrby("digit", 1, value)
|
||||||
redisdb.hincrby("stats", "digit", 1)
|
redisdb.hincrby("stats", "digit", 1)
|
||||||
continue
|
continue
|
||||||
if token.is_space:
|
if token.is_space:
|
||||||
redisdb.hincrby("stats", "space", 1)
|
redisdb.hincrby("stats", "space", 1)
|
||||||
continue
|
continue
|
||||||
if token.like_url:
|
if token.like_url:
|
||||||
redisdb.zincrby("url", 1, value)
|
redisdb.zincrby("url", 1, value)
|
||||||
redisdb.hincrby("stats", "url", 1)
|
redisdb.hincrby("stats", "url", 1)
|
||||||
continue
|
continue
|
||||||
if token.like_email:
|
if token.like_email:
|
||||||
redisdb.zincrby("email", 1, value)
|
redisdb.zincrby("email", 1, value)
|
||||||
redisdb.hincrby("stats", "email", 1)
|
redisdb.hincrby("stats", "email", 1)
|
||||||
continue
|
continue
|
||||||
redisdb.zincrby("oov", 1, value)
|
redisdb.zincrby("oov", 1, value)
|
||||||
redisdb.hincrby("stats", "oov", 1)
|
redisdb.hincrby("stats", "oov", 1)
|
||||||
|
|
||||||
|
|
||||||
if args.o == "json":
|
if args.o == "json":
|
||||||
output_json = {"format":"napkin", "version": version}
|
output_json = {"format": "napkin", "version": version}
|
||||||
for anal in analysis:
|
for anal in analysis:
|
||||||
more_info = ""
|
more_info = ""
|
||||||
if args.analysis == "all" or args.analysis == anal:
|
if args.analysis == "all" or args.analysis == anal:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
if anal == "span":
|
if anal == "span":
|
||||||
more_info = "for {}".format(args.token_span)
|
more_info = "for {}".format(args.token_span)
|
||||||
if args.o == "readable":
|
if args.o == "readable":
|
||||||
previous_value = None
|
previous_value = None
|
||||||
x = redisdb.zrevrange(anal, 0, args.t, withscores=True, score_cast_func=int)
|
x = redisdb.zrevrange(anal, 0, args.t, withscores=True, score_cast_func=int)
|
||||||
|
if args.o == "csv":
|
||||||
|
print()
|
||||||
|
elif args.o == "readable":
|
||||||
|
header = ["\033[1mTop {} of {} {}\033[0m".format(args.t, anal, more_info)]
|
||||||
|
readable_table = []
|
||||||
|
elif args.o == "json":
|
||||||
|
output_json.update({anal: []})
|
||||||
|
for a in x:
|
||||||
if args.o == "csv":
|
if args.o == "csv":
|
||||||
print()
|
print("{},{},{}".format(anal, a[0], a[1]))
|
||||||
elif args.o == "readable":
|
elif args.o == "readable":
|
||||||
header = ["\033[1mTop {} of {} {}\033[0m".format(args.t, anal, more_info)]
|
if previous_value == a[1]:
|
||||||
readable_table = []
|
readable_table.append(["{}".format(a[0])])
|
||||||
|
elif previous_value is None or a[1] < previous_value:
|
||||||
|
previous_value = a[1]
|
||||||
|
readable_table.append(["{} occurences".format(a[1])])
|
||||||
|
readable_table.append(["{}".format(a[0])])
|
||||||
elif args.o == "json":
|
elif args.o == "json":
|
||||||
output_json.update({anal:[]})
|
output_json[anal].append(a)
|
||||||
for a in x:
|
if args.o == "readable":
|
||||||
if args.o == "csv":
|
print(tabulate(readable_table, header, tablefmt=args.table_format))
|
||||||
print("{},{},{}".format(anal,a[0],a[1]))
|
if args.o == "csv":
|
||||||
elif args.o == "readable":
|
print("#")
|
||||||
if previous_value == a[1]:
|
|
||||||
readable_table.append(["{}".format(a[0])])
|
|
||||||
elif previous_value is None or a[1] < previous_value:
|
|
||||||
previous_value = a[1]
|
|
||||||
readable_table.append(["{} occurences".format(a[1])])
|
|
||||||
readable_table.append(["{}".format(a[0])])
|
|
||||||
elif args.o == "json":
|
|
||||||
output_json[anal].append(a)
|
|
||||||
if args.o == "readable":
|
|
||||||
print(tabulate(readable_table, header, tablefmt=args.table_format))
|
|
||||||
if args.o == "csv":
|
|
||||||
print("#")
|
|
||||||
|
|
||||||
if args.s:
|
if args.s:
|
||||||
print(redisdb.hgetall('stats'))
|
print(redisdb.hgetall('stats'))
|
||||||
|
|
Loading…
Reference in a new issue