Mirror of https://github.com/adulau/napkin-text-analysis.git (synced 2024-11-24 10:57:07 +00:00)
new: [option] --token-span to find a specific token in a sentence

This outputs the sentences in which a specific token has been seen. Requires the parser module of spaCy.
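
For context, the option relies on spaCy exposing the enclosing sentence of a token via `token.sent`, which is only available once sentence boundaries have been computed (here, by the parser component). A minimal sketch of that lookup, outside of napkin and assuming the `en_core_web_sm` model is installed:

~~~~
import spacy

# Load a pipeline with the parser enabled so sentence boundaries are set.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Vitelli held the fortress. The duke relied on his own soldiers.")

# Print every sentence that contains the requested token,
# which is what --token-span does for the analysed file.
for token in doc:
    if token.text == "Vitelli":
        print(token.sent.text)
~~~~
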
parent 85044335f4 · commit 42e3094489
2 changed files with 57 additions and 19 deletions

README.md
@@ -33,33 +33,38 @@ Intermediate results are stored in a Redis database to allow the analysis of mul
usage: napkin.py [-h] [-v V] [-f F] [-t T] [-s] [-o O] [-l L] [--verbatim]
                 [--no-flushdb] [--binary] [--analysis ANALYSIS]
                 [--disable-parser] [--disable-tagger]
                 [--token-span TOKEN_SPAN]

Extract statistical analysis of text

optional arguments:
  -h, --help            show this help message and exit
  -v V                  verbose output
  -f F                  file to analyse
  -t T                  maximum value for the top list (default is 100) -1 is
                        no limit
  -s                    display the overall statistics (default is False)
  -o O                  output format (default is csv), json, readable
  -l L                  language used for the analysis (default is en)
  --verbatim            Don't use the lemmatized form, use verbatim. (default
                        is the lematized form)
  --no-flushdb          Don't flush the redisdb, useful when you want to
                        process multiple files and aggregate the results. (by
                        default the redis database is flushed at each run)
  --binary              set output in binary instead of UTF-8 (default)
  --analysis ANALYSIS   Limit output to a specific analysis (verb, noun,
                        hashtag, mention, digit, url, oov, labels, punct).
                        (Default is all analysis are displayed)
  --disable-parser      disable parser component in Spacy
  --disable-tagger      disable tagger component in Spacy
  --token-span TOKEN_SPAN
                        Find the sentences where a specific token is located
~~~~

# example usage of napkin

## Generate all analysis for a given text

A sample file "The Prince, by Nicoló Machiavelli" is included to test napkin.

`python3 ./bin/napkin.py -o readable -f samples/the-prince.txt -t 4`

@@ -151,6 +156,32 @@ Example output:
╘═══════════════════╛
~~~~

## Extract the sentences associated to a specific token

`python3 ./bin/napkin.py -o readable -f samples/the-prince.txt -t 4 --token-span "Vitelli"`

~~~~
╒════════════════════════════════════════════════════════════════════════╕
│ Top 4 of span │
╞════════════════════════════════════════════════════════════════════════╡
│ Nevertheless, Messer Niccolo Vitelli has been seen in │
│ our own time to destroy two fortresses in Città di Castello in order │
│ to keep that state. │
├────────────────────────────────────────────────────────────────────────┤
│ And the │
│ difference between these forces can be easily seen if one considers │
│ the difference between the reputation of the duke when he had only the │
│ French, when he had the Orsini and Vitelli, and when he had to rely │
│ on himself and his own soldiers. │
├────────────────────────────────────────────────────────────────────────┤
│ And that his foundations were │
│ good is seen from the fact that the Romagna waited for him more than a │
│ month; in Rome, although half dead, he remained secure, and although │
│ the Baglioni, Vitelli, and Orsini entered Rome they found no followers │
│ against him. │
╘════════════════════════════════════════════════════════════════════════╛
~~~~
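
Because the intermediate results sit in Redis (and survive across runs when `--no-flushdb` is used), the "span" sorted set behind this output can also be inspected directly; it is the set the `zincrby("span", ...)` call in the code diff below increments. A rough sketch with redis-py, assuming a local Redis on the default port and the database napkin was pointed at (adjust host/port/db to your setup):

~~~~
import redis

# Assumption: local Redis, default database; change parameters as needed.
r = redis.Redis(decode_responses=True)

# "span" holds one member per sentence containing the requested token,
# scored by how often that sentence matched; highest counts first.
for sentence, count in r.zrevrange("span", 0, 3, withscores=True):
    print(int(count), sentence)
~~~~
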
# what about the name?

The name 'napkin' came after a first sketch of the idea on a napkin. The goal was also to provide a simple text analysis tool which can be run on the corner of a table in a kitchen.
bin/napkin.py

@@ -22,6 +22,7 @@ parser.add_argument('--binary', help="set output in binary instead of UTF-8 (def
parser.add_argument('--analysis', help="Limit output to a specific analysis (verb, noun, hashtag, mention, digit, url, oov, labels, punct). (Default is all analysis are displayed)", default='all')
parser.add_argument('--disable-parser', help="disable parser component in Spacy", default=False, action='store_true')
parser.add_argument('--disable-tagger', help="disable tagger component in Spacy", default=False, action='store_true')
parser.add_argument('--token-span', default=None, help='Find the sentences where a specific token is located')

args = parser.parse_args()
if args.f is None:

@@ -70,9 +71,15 @@ analysis = ["verb", "noun", "hashtag", "mention",
            "digit", "url", "oov", "labels",
            "punct"]

if args.token_span and not disable:
    analysis.append("span")

redisdb.hset("stats", "token", doc.__len__())

for token in doc:
    if args.token_span is not None and not disable:
        if token.text == args.token_span:
            redisdb.zincrby("span", 1, token.sent.as_doc().text)
    if token.pos_ == "VERB" and not token.is_oov and len(token) > 1:
        if not args.verbatim:
            redisdb.zincrby("verb", 1, token.lemma_)
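
One detail worth spelling out about the `not disable` guards in the added lines: `token.sent` only works once sentence boundaries are set, which is why the commit note says the feature requires spaCy's parser module, and why `--token-span` is skipped when the parser is disabled. A small standalone illustration of checking for that annotation (not napkin's code, assuming the `en_core_web_sm` model is installed):

~~~~
import spacy

# With the parser excluded, no component sets sentence boundaries,
# so token.sent cannot be used safely.
nlp = spacy.load("en_core_web_sm", exclude=["parser"])
doc = nlp("Vitelli held the fortress. The duke relied on his own soldiers.")

if doc.has_annotation("SENT_START"):
    print(doc[0].sent.text)
else:
    print("no sentence boundaries: run without --disable-parser to use --token-span")
~~~~
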