mirror of
https://github.com/adulau/napkin-text-analysis.git
synced 2024-11-22 01:47:06 +00:00
new: [lemmatized/verbatim] displaying verbatim or lemmatized version is now an option
This commit is contained in:
parent
3d71d9288e
commit
949e41d19f
2 changed files with 17 additions and 5 deletions
|
@ -13,6 +13,8 @@ Analysis features are :
|
||||||
- Mention frequency (everything prefixed with an @ symbol)
|
- Mention frequency (everything prefixed with an @ symbol)
|
||||||
- Out-Of-Vocabulary (OOV) word frequency, meaning any word outside the English dictionary
|
- Out-Of-Vocabulary (OOV) word frequency, meaning any word outside the English dictionary
|
||||||
|
|
||||||
|
Verbs and nouns are reported in their lemmatized form by default; the `--verbatim` option keeps the original inflection instead.
|
||||||
|
|
||||||
# requirements
|
# requirements
|
||||||
|
|
||||||
- Python >= 3.6
|
- Python >= 3.6
|
||||||
|
@ -22,7 +24,7 @@ Analysis features are :
|
||||||
# how to use napkin
|
# how to use napkin
|
||||||
|
|
||||||
~~~~
|
~~~~
|
||||||
usage: napkin.py [-h] [-v V] [-f F] [-t T] [-s] [-o O]
|
usage: napkin.py [-h] [-v V] [-f F] [-t T] [-s] [-o O] [-l L] [--verbatim]
|
||||||
|
|
||||||
Extract statistical analysis of text
|
Extract statistical analysis of text
|
||||||
|
|
||||||
|
@ -32,7 +34,10 @@ optional arguments:
|
||||||
-f F file to analyse
|
-f F file to analyse
|
||||||
-t T maximum value for the top list (default is 100) -1 is no limit
|
-t T maximum value for the top list (default is 100) -1 is no limit
|
||||||
-s display the overall statistics (default is False)
|
-s display the overall statistics (default is False)
|
||||||
-o O output format (default is csv)
|
-o O output format (default is csv), json
|
||||||
|
-l L language used for the analysis (default is en)
|
||||||
|
--verbatim Don't use the lemmatized form, use verbatim. (default is the
|
||||||
|
lemmatized form)
|
||||||
~~~~
|
~~~~
|
||||||
|
|
||||||
# example usage of napkin
|
# example usage of napkin
|
||||||
|
|
|
@ -14,7 +14,8 @@ parser.add_argument('-f', help="file to analyse")
|
||||||
parser.add_argument('-t', help="maximum value for the top list (default is 100) -1 is no limit", default=100)
|
parser.add_argument('-t', help="maximum value for the top list (default is 100) -1 is no limit", default=100)
|
||||||
parser.add_argument('-s', help="display the overall statistics (default is False)", default=False, action='store_true')
|
parser.add_argument('-s', help="display the overall statistics (default is False)", default=False, action='store_true')
|
||||||
parser.add_argument('-o', help="output format (default is csv), json", default="csv")
|
parser.add_argument('-o', help="output format (default is csv), json", default="csv")
|
||||||
parser.add_argument("-l", help="language used for the analysis (default is en)", default="en")
|
parser.add_argument('-l', help="language used for the analysis (default is en)", default="en")
|
||||||
|
parser.add_argument('--verbatim', help="Don't use the lemmatized form, use verbatim. (default is the lematized form)", default=False, action='store_true')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if args.f is None:
|
if args.f is None:
|
||||||
|
@ -51,11 +52,17 @@ redisdb.hset("stats", "token", doc.__len__())
|
||||||
|
|
||||||
for token in doc:
|
for token in doc:
|
||||||
if token.pos_ == "VERB" and not token.is_oov:
|
if token.pos_ == "VERB" and not token.is_oov:
|
||||||
redisdb.zincrby("verb:napkin", 1, token.lemma_)
|
if not args.verbatim:
|
||||||
|
redisdb.zincrby("verb:napkin", 1, token.lemma_)
|
||||||
|
else:
|
||||||
|
redisdb.zincrby("verb:napkin", 1, token.text)
|
||||||
redisdb.hincrby("stats", "verb:napkin", 1)
|
redisdb.hincrby("stats", "verb:napkin", 1)
|
||||||
continue
|
continue
|
||||||
if token.pos_ == "NOUN" and not token.is_oov:
|
if token.pos_ == "NOUN" and not token.is_oov:
|
||||||
redisdb.zincrby("noun:napkin", 1, token.lemma_)
|
if not args.verbatim:
|
||||||
|
redisdb.zincrby("noun:napkin", 1, token.lemma_)
|
||||||
|
else:
|
||||||
|
redisdb.zincrby("noun:napkin", 1, token.text)
|
||||||
redisdb.hincrby("stats", "noun:napkin", 1)
|
redisdb.hincrby("stats", "noun:napkin", 1)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue