chg: [langdetect] detection of language before further processing

Before processing the text, we use cld3 to detect its language
and compare it with the language of the spaCy model selected for processing.
This commit is contained in:
Alexandre Dulaunoy 2020-10-09 20:47:43 +02:00
parent 98a8d8275e
commit 793e7ae9c5
Signed by: adulau
GPG key ID: 09E2CD4944E6CBCD

View file

@ -3,11 +3,11 @@
import redis import redis
import spacy import spacy
from spacy_langdetect import LanguageDetector
import argparse import argparse
import sys import sys
import simplejson as json import simplejson as json
from tabulate import tabulate from tabulate import tabulate
import cld3
parser = argparse.ArgumentParser(description="Extract statistical analysis of text") parser = argparse.ArgumentParser(description="Extract statistical analysis of text")
parser.add_argument('-v', help="verbose output") parser.add_argument('-v', help="verbose output")
@ -41,16 +41,20 @@ if not args.no_flushdb:
if args.l == "fr": if args.l == "fr":
nlp = spacy.load("fr_core_news_md") nlp = spacy.load("fr_core_news_md")
else: elif args.l == "en":
nlp = spacy.load("en_core_web_md") nlp = spacy.load("en_core_web_md")
else:
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True) sys.exit("Language not supported")
nlp.max_length = 2000000 nlp.max_length = 2000000
with open(args.f, 'r') as file: with open(args.f, 'r') as file:
text = file.read() text = file.read()
detect_lang = cld3.get_language(text)
if detect_lang[0] != args.l:
sys.exit("Language detected ({}) is different than the NLP used ({})".format(detect_lang[0], args.l))
doc = nlp(text) doc = nlp(text)
analysis = ["verb", "noun", "hashtag", "mention", analysis = ["verb", "noun", "hashtag", "mention",