From 793e7ae9c52de010c089660dc7ac0c604d2a62d0 Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy <a@foo.be>
Date: Fri, 9 Oct 2020 20:47:43 +0200
Subject: [PATCH] chg: [langdetect] detection of language before further
 processing

Before processing the text, we use cld3 to detect its language and
check that it matches the language of the spaCy model to be used.
---
 bin/napkin.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/bin/napkin.py b/bin/napkin.py
index ac936ff..7a2e6b9 100644
--- a/bin/napkin.py
+++ b/bin/napkin.py
@@ -3,11 +3,11 @@
 
 import redis
 import spacy
-from spacy_langdetect import LanguageDetector
 import argparse
 import sys
 import simplejson as json
 from tabulate import tabulate
+import cld3
 
 parser = argparse.ArgumentParser(description="Extract statistical analysis of text")
 parser.add_argument('-v', help="verbose output")
@@ -41,16 +41,20 @@ if not args.no_flushdb:
 
 if args.l == "fr":
     nlp = spacy.load("fr_core_news_md")
-else:
+elif args.l == "en":
     nlp = spacy.load("en_core_web_md")
-
-nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
+else:
+    sys.exit("Language not supported")
 
 nlp.max_length = 2000000
 
 with open(args.f, 'r') as file:
     text = file.read()
 
+detect_lang = cld3.get_language(text)
+if detect_lang[0] != args.l:
+    sys.exit("Language detected ({}) is different than the NLP used ({})".format(detect_lang[0], args.l))
+
 doc = nlp(text)
 
 analysis = ["verb", "noun", "hashtag", "mention",