mirror of
https://github.com/adulau/napkin-text-analysis.git
synced 2024-11-21 17:37:07 +00:00
chg: [langdetect] detection of language before further processing
Before processing the text, we use cld3 to detect its language and check that it matches the language of the spaCy model that is going to be used.
This commit is contained in:
parent
98a8d8275e
commit
793e7ae9c5
1 changed file with 8 additions and 4 deletions
|
@ -3,11 +3,11 @@
|
||||||
|
|
||||||
import redis
|
import redis
|
||||||
import spacy
|
import spacy
|
||||||
from spacy_langdetect import LanguageDetector
|
|
||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
import simplejson as json
|
import simplejson as json
|
||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
|
import cld3
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="Extract statistical analysis of text")
|
parser = argparse.ArgumentParser(description="Extract statistical analysis of text")
|
||||||
parser.add_argument('-v', help="verbose output")
|
parser.add_argument('-v', help="verbose output")
|
||||||
|
@ -41,16 +41,20 @@ if not args.no_flushdb:
|
||||||
|
|
||||||
if args.l == "fr":
|
if args.l == "fr":
|
||||||
nlp = spacy.load("fr_core_news_md")
|
nlp = spacy.load("fr_core_news_md")
|
||||||
else:
|
elif args.l == "en":
|
||||||
nlp = spacy.load("en_core_web_md")
|
nlp = spacy.load("en_core_web_md")
|
||||||
|
else:
|
||||||
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
|
sys.exit("Language not supported")
|
||||||
|
|
||||||
nlp.max_length = 2000000
|
nlp.max_length = 2000000
|
||||||
|
|
||||||
with open(args.f, 'r') as file:
|
with open(args.f, 'r') as file:
|
||||||
text = file.read()
|
text = file.read()
|
||||||
|
|
||||||
|
detect_lang = cld3.get_language(text)
|
||||||
|
if detect_lang[0] != args.l:
|
||||||
|
sys.exit("Language detected ({}) is different than the NLP used ({})".format(detect_lang[0], args.l))
|
||||||
|
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
|
|
||||||
analysis = ["verb", "noun", "hashtag", "mention",
|
analysis = ["verb", "noun", "hashtag", "mention",
|
||||||
|
|
Loading…
Reference in a new issue