mirror of
https://github.com/adulau/napkin-text-analysis.git
synced 2024-11-24 10:57:07 +00:00
new: [feature] -s option to display the overall statistics of different tokens seen
This commit is contained in:
parent
dd7c796460
commit
526f88071c
2 changed files with 16 additions and 1 deletion
|
@@ -22,7 +22,7 @@ Analysis features are :
|
||||||
# how to use napkin
|
# how to use napkin
|
||||||
|
|
||||||
~~~~
|
~~~~
|
||||||
usage: napkin.py [-h] [-v V] [-f F] [-t T] [-o O]
|
usage: napkin.py [-h] [-v V] [-f F] [-t T] [-s] [-o O]
|
||||||
|
|
||||||
Extract statistical analysis of text
|
Extract statistical analysis of text
|
||||||
|
|
||||||
|
@@ -31,6 +31,7 @@ optional arguments:
|
||||||
-v V verbose output
|
-v V verbose output
|
||||||
-f F file to analyse
|
-f F file to analyse
|
||||||
-t T maximum value for the top list (default is 100) -1 is no limit
|
-t T maximum value for the top list (default is 100) -1 is no limit
|
||||||
|
-s display the overall statistics (default is False)
|
||||||
-o O output format (default is csv)
|
-o O output format (default is csv)
|
||||||
~~~~
|
~~~~
|
||||||
|
|
||||||
|
|
|
@@ -11,6 +11,7 @@ parser = argparse.ArgumentParser(description="Extract statistical analysis of te
|
||||||
parser.add_argument('-v', help="verbose output")
|
parser.add_argument('-v', help="verbose output")
|
||||||
parser.add_argument('-f', help="file to analyse")
|
parser.add_argument('-f', help="file to analyse")
|
||||||
parser.add_argument('-t', help="maximum value for the top list (default is 100) -1 is no limit", default=100)
|
parser.add_argument('-t', help="maximum value for the top list (default is 100) -1 is no limit", default=100)
|
||||||
|
parser.add_argument('-s', help="display the overall statistics (default is False)", default=False, action='store_true')
|
||||||
parser.add_argument('-o', help="output format (default is csv)", default="csv")
|
parser.add_argument('-o', help="output format (default is csv)", default="csv")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if args.f is None:
|
if args.f is None:
|
||||||
|
@@ -37,34 +38,45 @@ doc = nlp(text)
|
||||||
|
|
||||||
analysis = ["verb:napkin", "noun:napkin", "hashtag:napkin", "mention:napkin", "digit:napkin", "url:napking", "oov:napkin", "labels:napkin"]
|
analysis = ["verb:napkin", "noun:napkin", "hashtag:napkin", "mention:napkin", "digit:napkin", "url:napking", "oov:napkin", "labels:napkin"]
|
||||||
|
|
||||||
|
redisdb.hset("stats", "token", doc.__len__())
|
||||||
|
|
||||||
for token in doc:
|
for token in doc:
|
||||||
if token.pos_ == "VERB" and not token.is_oov:
|
if token.pos_ == "VERB" and not token.is_oov:
|
||||||
redisdb.zincrby("verb:napkin", 1, token.lemma_)
|
redisdb.zincrby("verb:napkin", 1, token.lemma_)
|
||||||
|
redisdb.hincrby("stats", "verb:napkin", 1)
|
||||||
continue
|
continue
|
||||||
if token.pos_ == "NOUN" and not token.is_oov:
|
if token.pos_ == "NOUN" and not token.is_oov:
|
||||||
redisdb.zincrby("noun:napkin", 1, token.lemma_)
|
redisdb.zincrby("noun:napkin", 1, token.lemma_)
|
||||||
|
redisdb.hincrby("stats", "noun:napkin", 1)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if token.is_oov:
|
if token.is_oov:
|
||||||
value = "{}".format(token)
|
value = "{}".format(token)
|
||||||
if value.startswith('#'):
|
if value.startswith('#'):
|
||||||
redisdb.zincrby("hashtag:napkin", 1, value[1:])
|
redisdb.zincrby("hashtag:napkin", 1, value[1:])
|
||||||
|
redisdb.hincrby("stats", "hashtag:napkin", 1)
|
||||||
continue
|
continue
|
||||||
if value.startswith('@'):
|
if value.startswith('@'):
|
||||||
redisdb.zincrby("mention:napkin", 1, value[1:])
|
redisdb.zincrby("mention:napkin", 1, value[1:])
|
||||||
|
redisdb.hincrby("stats", "mention:napkin", 1)
|
||||||
continue
|
continue
|
||||||
if token.is_digit:
|
if token.is_digit:
|
||||||
redisdb.zincrby("digit:napkin", 1, value)
|
redisdb.zincrby("digit:napkin", 1, value)
|
||||||
|
redisdb.hincrby("stats", "digit:napkin", 1)
|
||||||
continue
|
continue
|
||||||
if token.is_space:
|
if token.is_space:
|
||||||
|
redisdb.hincrby("stats", "space:napkin", 1)
|
||||||
continue
|
continue
|
||||||
if token.like_url:
|
if token.like_url:
|
||||||
redisdb.zincrby("url:napkin", 1, value)
|
redisdb.zincrby("url:napkin", 1, value)
|
||||||
|
redisdb.hincrby("stats", "url:napkin", 1)
|
||||||
continue
|
continue
|
||||||
if token.like_email:
|
if token.like_email:
|
||||||
redisdb.zincrby("email:napkin", 1, value)
|
redisdb.zincrby("email:napkin", 1, value)
|
||||||
|
redisdb.hincrby("stats", "email:napkin", 1)
|
||||||
continue
|
continue
|
||||||
redisdb.zincrby("oov:napkin", 1, value)
|
redisdb.zincrby("oov:napkin", 1, value)
|
||||||
|
redisdb.hincrby("stats", "oov:napkin", 1)
|
||||||
|
|
||||||
|
|
||||||
for entity in doc.ents:
|
for entity in doc.ents:
|
||||||
|
@@ -78,3 +90,5 @@ for anal in analysis:
|
||||||
print ("{},{}".format(a[0],a[1]))
|
print ("{},{}".format(a[0],a[1]))
|
||||||
print ("#")
|
print ("#")
|
||||||
|
|
||||||
|
if args.s:
|
||||||
|
print (redisdb.hgetall('stats'))
|
||||||
|
|
Loading…
Reference in a new issue