chg: [langdetect] detection of language before further processing

Before processing the text, we use cld3 to detect its language and check that it matches the language of the spaCy model selected for the analysis; if the two differ, the script exits with an error.
2024-11-21 17:37:07 +00:00 · 2020-10-09 20:47:43 +02:00 · 2020-10-09 20:47:43 +02:00 · 793e7ae9c5
commit 793e7ae9c5
parent 98a8d8275e
1 changed files with 8 additions and 4 deletions
--- a/bin/napkin.py
+++ b/bin/napkin.py
@@ -3,11 +3,11 @@
 import redis
 import spacy
 from spacy_langdetect import LanguageDetector
 import argparse
 import sys
 import simplejson as json
 from tabulate import tabulate
 import cld3
 parser = argparse.ArgumentParser(description="Extract statistical analysis of text")
 parser.add_argument('-v', help="verbose output")
@@ -41,16 +41,20 @@ if not args.no_flushdb:
 if args.l == "fr":
    nlp = spacy.load("fr_core_news_md")
-else:
+elif args.l == "en":
    nlp = spacy.load("en_core_web_md")
-
+else:
-nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
+    sys.exit("Language not supported")
 nlp.max_length = 2000000
 with open(args.f, 'r') as file:
    text = file.read()
 detect_lang = cld3.get_language(text)
 if detect_lang[0] != args.l:
    sys.exit("Language detected ({}) is different than the NLP used ({})".format(detect_lang[0], args.l))
 doc = nlp(text)
 analysis = ["verb", "noun", "hashtag", "mention",