2020-08-19 15:33:04 +00:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import redis
import spacy
import argparse
import sys
2020-09-21 05:50:57 +00:00
import simplejson as json
2020-10-09 16:36:33 +00:00
from tabulate import tabulate
2020-10-09 18:47:43 +00:00
import cld3
2020-08-19 15:33:04 +00:00
# ---------------------------------------------------------------------------
# Command-line interface
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description="Extract statistical analysis of text")
parser.add_argument('-v', help="verbose output")
parser.add_argument('-f', help="file to analyse")
# type=int so that a value given on the command line is a number, like the
# default, before it is used to compute the Redis range below.
parser.add_argument(
    '-t',
    help="maximum value for the top list (default is 100) -1 is no limit",
    default=100,
    type=int,
)
parser.add_argument(
    '-s',
    help="display the overall statistics (default is False)",
    default=False,
    action='store_true',
)
parser.add_argument('-o', help="output format (default is csv), json, readable", default="csv")
parser.add_argument('-l', help="language used for the analysis (default is en)", default="en")
parser.add_argument(
    '--verbatim',
    help="Don't use the lemmatized form, use verbatim. (default is the lematized form)",
    default=False,
    action='store_true',
)
parser.add_argument(
    '--no-flushdb',
    help="Don't flush the redisdb, useful when you want to process multiple files "
         "and aggregate the results. (by default the redis database is flushed at each run)",
    default=False,
    action='store_true',
)
parser.add_argument(
    '--binary',
    help="Output response in binary instead of UTF-8 (default)",
    default=False,
    action='store_true',
)

args = parser.parse_args()
if args.f is None:
    # Nothing to analyse: show the usage text and stop.
    parser.print_help()
    sys.exit()

# ---------------------------------------------------------------------------
# Redis connection (used as the counting backend)
# ---------------------------------------------------------------------------
if not args.binary:
    # Decode replies to str so members and stats print as text.
    redisdb = redis.Redis(host="localhost", port=6380, db=5, encoding='utf-8', decode_responses=True)
else:
    redisdb = redis.Redis(host="localhost", port=6380, db=5)

try:
    redisdb.ping()
except redis.exceptions.RedisError:
    # Only Redis failures are handled here; a bare except would also swallow
    # KeyboardInterrupt/SystemExit. Exit non-zero so callers can detect it.
    print("Redis database on port 6380 is not running...", file=sys.stderr)
    sys.exit(1)

if not args.no_flushdb:
    redisdb.flushdb()

# ---------------------------------------------------------------------------
# NLP model selection and input loading
# ---------------------------------------------------------------------------
if args.l == "fr":
    nlp = spacy.load("fr_core_news_md")
elif args.l == "en":
    nlp = spacy.load("en_core_web_md")
else:
    sys.exit("Language not supported")

nlp.max_length = 2000000

with open(args.f, 'r') as file:
    text = file.read()

# Refuse to analyse a text whose detected language differs from the model.
detect_lang = cld3.get_language(text)
# get_language may yield no prediction (e.g. for empty input); report that as
# "und" (the ISO 639-2 code for undetermined) instead of failing on None[0].
detected_code = detect_lang[0] if detect_lang is not None else "und"
if detected_code != args.l:
    sys.exit("Language detected ({}) is different than the NLP used ({})".format(detected_code, args.l))

doc = nlp(text)

# Sorted-set keys reported in the output, in display order.
# NOTE(review): "email" is counted below but is not listed here, so it is
# never printed -- confirm whether that omission is intentional.
analysis = ["verb", "noun", "hashtag", "mention",
            "digit", "url", "oov", "labels",
            "punct"]

redisdb.hset("stats", "token", len(doc))

# ---------------------------------------------------------------------------
# Token classification: each token increments at most one sorted set (plus
# the matching counter in the "stats" hash); the first matching rule wins.
# ---------------------------------------------------------------------------
pos_keys = {"VERB": "verb", "NOUN": "noun"}

for token in doc:
    pos_key = pos_keys.get(token.pos_)
    if pos_key is not None and not token.is_oov and len(token) > 1:
        # Count the lemma unless --verbatim asks for the surface form.
        redisdb.zincrby(pos_key, 1, token.text if args.verbatim else token.lemma_)
        redisdb.hincrby("stats", pos_key, 1)
        continue
    if token.pos_ == "PUNCT" and not token.is_oov:
        # Use this token's own text: `value` is only assigned in the
        # out-of-vocabulary branch below and is not valid here.
        redisdb.zincrby("punct", 1, token.text)
        redisdb.hincrby("stats", "punct", 1)
        continue
    if token.is_oov:
        value = token.text
        if value.startswith('#'):
            redisdb.zincrby("hashtag", 1, value[1:])
            redisdb.hincrby("stats", "hashtag", 1)
            continue
        if value.startswith('@'):
            redisdb.zincrby("mention", 1, value[1:])
            redisdb.hincrby("stats", "mention", 1)
            continue
        if token.is_digit:
            redisdb.zincrby("digit", 1, value)
            redisdb.hincrby("stats", "digit", 1)
            continue
        if token.is_space:
            # Whitespace is only counted, not stored in a sorted set.
            redisdb.hincrby("stats", "space", 1)
            continue
        if token.like_url:
            redisdb.zincrby("url", 1, value)
            redisdb.hincrby("stats", "url", 1)
            continue
        if token.like_email:
            redisdb.zincrby("email", 1, value)
            redisdb.hincrby("stats", "email", 1)
            continue
        redisdb.zincrby("oov", 1, value)
        redisdb.hincrby("stats", "oov", 1)

for entity in doc.ents:
    redisdb.zincrby("labels", 1, entity.label_)

# ---------------------------------------------------------------------------
# Reporting
# ---------------------------------------------------------------------------
if args.o == "json":
    output_json = {"format": "napkin"}

# ZREVRANGE indexes are zero-based and inclusive, so the top N members are
# ranks 0..N-1. A negative -t means "no limit" (-1 addresses the last member).
range_stop = args.t - 1 if args.t > 0 else -1

for anal in analysis:
    if args.t == 0:
        # A stop index of -1 would return everything; a top-0 list is empty.
        ranked = []
    else:
        ranked = redisdb.zrevrange(anal, 0, range_stop, withscores=True, score_cast_func=int)

    if args.o == "csv":
        print()
    elif args.o == "readable":
        previous_value = None
        header = ["\033[1mTop {} of {}\033[0m".format(args.t, anal)]
        readable_table = []
    elif args.o == "json":
        output_json.update({anal: []})

    for entry in ranked:
        member, score = entry
        if args.o == "csv":
            print("{},{},{}".format(anal, member, score))
        elif args.o == "readable":
            # Entries arrive in descending score order: emit a heading row
            # whenever the score drops (including for the very first entry),
            # then list the member under it.
            if previous_value is None or score < previous_value:
                previous_value = score
                readable_table.append(["{} occurences".format(score)])
            readable_table.append(["{}".format(member)])
        elif args.o == "json":
            output_json[anal].append(entry)

    if args.o == "readable":
        print(tabulate(readable_table, header, tablefmt="fancy_grid"))
    if args.o == "csv":
        print("#")

if args.s:
    print(redisdb.hgetall('stats'))
if args.o == "json":
    print(json.dumps(output_json))