From 85044335f4e99712d82249e8725404f694b9ef93 Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Sun, 11 Oct 2020 11:04:30 +0200 Subject: [PATCH] new: [option] to disable parser and/or tagger from the standard processing pipeline of Spacy If you don't need any of the syntactic information while using napkin, you can disable the parser and the tagger. This saves memory and processing time. By default, both remain active, as napkin might make use of the syntactic information in the future. --- README.md | 3 +++ bin/napkin.py | 12 ++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 179eb03..aa8bbe9 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ Intermediate results are stored in a Redis database to allow the analysis of mul ~~~~ usage: napkin.py [-h] [-v V] [-f F] [-t T] [-s] [-o O] [-l L] [--verbatim] [--no-flushdb] [--binary] [--analysis ANALYSIS] + [--disable-parser] [--disable-tagger] Extract statistical analysis of text @@ -53,6 +54,8 @@ optional arguments: --analysis ANALYSIS Limit output to a specific analysis (verb, noun, hashtag, mention, digit, url, oov, labels, punct). (Default is all analysis are displayed) + --disable-parser disable parser component in Spacy + --disable-tagger disable tagger component in Spacy ~~~~ # example usage of napkin diff --git a/bin/napkin.py b/bin/napkin.py index ec10f85..193e8aa 100644 --- a/bin/napkin.py +++ b/bin/napkin.py @@ -20,6 +20,8 @@ parser.add_argument('--verbatim', help="Don't use the lemmatized form, use verba parser.add_argument('--no-flushdb', help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. 
(by default the redis database is flushed at each run)", default=False, action='store_true') parser.add_argument('--binary', help="set output in binary instead of UTF-8 (default)", default=False, action='store_true') parser.add_argument('--analysis', help="Limit output to a specific analysis (verb, noun, hashtag, mention, digit, url, oov, labels, punct). (Default is all analysis are displayed)", default='all') +parser.add_argument('--disable-parser', help="disable parser component in Spacy", default=False, action='store_true') +parser.add_argument('--disable-tagger', help="disable tagger component in Spacy", default=False, action='store_true') args = parser.parse_args() if args.f is None: @@ -40,10 +42,16 @@ except: if not args.no_flushdb: redisdb.flushdb() +disable = [] +if args.disable_parser: + disable.append("parser") +if args.disable_tagger: + disable.append("tagger") + if args.l == "fr": - nlp = spacy.load("fr_core_news_md") + nlp = spacy.load("fr_core_news_md", disable=disable) elif args.l == "en": - nlp = spacy.load("en_core_web_md") + nlp = spacy.load("en_core_web_md", disable=disable) else: sys.exit("Language not supported")