From 793e7ae9c52de010c089660dc7ac0c604d2a62d0 Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Fri, 9 Oct 2020 20:47:43 +0200 Subject: [PATCH] chg: [langdetect] detection of language before further processing Before processing the text, we use cld3 to detect the language and check that it matches the spaCy model selected for processing. --- bin/napkin.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bin/napkin.py b/bin/napkin.py index ac936ff..7a2e6b9 100644 --- a/bin/napkin.py +++ b/bin/napkin.py @@ -3,11 +3,11 @@ import redis import spacy -from spacy_langdetect import LanguageDetector import argparse import sys import simplejson as json from tabulate import tabulate +import cld3 parser = argparse.ArgumentParser(description="Extract statistical analysis of text") parser.add_argument('-v', help="verbose output") @@ -41,16 +41,20 @@ if not args.no_flushdb: if args.l == "fr": nlp = spacy.load("fr_core_news_md") -else: +elif args.l == "en": nlp = spacy.load("en_core_web_md") - -nlp.add_pipe(LanguageDetector(), name='language_detector', last=True) +else: + sys.exit("Language not supported") nlp.max_length = 2000000 with open(args.f, 'r') as file: text = file.read() +detect_lang = cld3.get_language(text) +if detect_lang[0] != args.l: + sys.exit("Language detected ({}) is different than the NLP used ({})".format(detect_lang[0], args.l)) + doc = nlp(text) analysis = ["verb", "noun", "hashtag", "mention",