#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Extract statistical analysis of text (napkin), aggregating counters in Redis."""
# stdlib
import argparse
import fileinput
import sys

# third-party
import cld3
import redis
import simplejson as json
import spacy
from tabulate import tabulate
# Version string embedded in the JSON output header.
version = "0.9"

# Command-line interface.  Input comes either from -f FILE or -i (stdin).
parser = argparse.ArgumentParser(description="Extract statistical analysis of text")
parser.add_argument('-v', help="verbose output")
parser.add_argument('-f', help="file to analyse")
parser.add_argument(
    '-t',
    help="maximum value for the top list (default is 100) -1 is no limit",
    default=100,
    type=int,  # fix: without type=int, CLI values arrived as str while the default was int
)
parser.add_argument(
    '-s',
    help="display the overall statistics (default is False)",
    default=False,
    action='store_true',
)
parser.add_argument(
    '-o', help="output format (default is csv), json, readable", default="csv"
)
parser.add_argument(
    '-l', help="language used for the analysis (default is en)", default="en"
)
parser.add_argument(
    '-i', help="Use stdin instead of a filename", default=False, action='store_true'
)
parser.add_argument(
    '--verbatim',
    help="Don't use the lemmatized form, use verbatim. (default is the lematized form)",
    default=False,
    action='store_true',
)
parser.add_argument(
    '--no-flushdb',
    help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. (by default the redis database is flushed at each run)",
    default=False,
    action='store_true',
)
parser.add_argument(
    '--binary',
    help="set output in binary instead of UTF-8 (default)",
    default=False,
    action='store_true',
)
parser.add_argument(
    '--analysis',
    help="Limit output to a specific analysis (verb, noun, hashtag, mention, digit, url, oov, labels, punct). (Default is all analysis are displayed)",
    default='all',
)
parser.add_argument(
    '--disable-parser',
    help="disable parser component in Spacy",
    default=False,
    action='store_true',
)
parser.add_argument(
    '--disable-tagger',
    help="disable tagger component in Spacy",
    default=False,
    action='store_true',
)
parser.add_argument(
    '--token-span',
    default=None,
    help='Find the sentences where a specific token is located',
)
parser.add_argument(
    '--table-format',
    help="set tabulate format (default is fancy_grid)",
    default="fancy_grid",
)
parser.add_argument(
    '--full-labels',
    help="store each label value in a ranked set (default is False)",
    action='store_true',
    default=False,
)
# parser.add_argument('--geolocation', help="export geolocation (default is False)", action='store_true', default=False)
args = parser.parse_args()

# Either a filename (-f) or stdin (-i) must be supplied.
if args.f is None and not args.i:
    parser.print_help()
    sys.exit()

# if args.geolocation:
#     args.full_labels = True

# All counters are aggregated in Redis DB 5 on localhost.  With --binary the
# raw bytes are kept; otherwise responses are decoded to UTF-8 strings.
if not args.binary:
    redisdb = redis.Redis(
        host="localhost", port=6379, db=5, encoding='utf-8', decode_responses=True
    )
else:
    redisdb = redis.Redis(host="localhost", port=6379, db=5)

try:
    redisdb.ping()
except redis.exceptions.RedisError:  # fix: bare except also caught KeyboardInterrupt/SystemExit
    print("Redis database on port 6379 is not running...", file=sys.stderr)
    sys.exit()

# Start from an empty database unless the user wants to aggregate several runs.
if not args.no_flushdb:
    redisdb.flushdb()
# Optionally disable spaCy pipeline components to speed up processing.
disable = []
if args.disable_parser:
    disable.append("parser")
if args.disable_tagger:
    disable.append("tagger")

# Load the medium model for the requested language, downloading it on first use.
if args.l == "fr":
    try:
        nlp = spacy.load("fr_core_news_md", disable=disable)
    except OSError:  # spaCy raises OSError when the model package is missing
        print("Downloading missing model")
        # fix: previously downloaded "en_core_web_md" here, so the retry below
        # could never succeed on a clean system
        spacy.cli.download("fr_core_news_md")
        nlp = spacy.load("fr_core_news_md", disable=disable)
elif args.l == "en":
    try:
        nlp = spacy.load("en_core_web_md", disable=disable)
    except OSError:
        print("Downloading missing model")
        spacy.cli.download("en_core_web_md")
        nlp = spacy.load("en_core_web_md", disable=disable)
else:
    sys.exit("Language not supported")

# Raise spaCy's default 1,000,000-character limit to allow larger inputs.
nlp.max_length = 2000000
# Read the text to analyse from the given file, or from stdin with -i.
if args.f:
    with open(args.f, 'r') as file:
        text = file.read()
if args.i:
    text = sys.stdin.read()  # fix: replaces quadratic line-by-line string concatenation

# Refuse to analyse text whose detected language differs from the chosen model.
detect_lang = cld3.get_language(text)
if detect_lang[0] != args.l:
    sys.exit(
        "Language detected ({}) is different than the NLP used ({})".format(
            detect_lang[0], args.l
        )
    )

doc = nlp(text)
# Analyses rendered in the output phase; each name is also a Redis sorted set.
analysis = [
    "verb",
    "noun",
    "hashtag",
    "mention",
    "digit",
    "url",
    "oov",
    "labels",
    "punct",
    "email",
]

# Sentence spans require the parser component, so only enable the "span"
# analysis when no component was disabled.
if args.token_span and not disable:
    analysis.append("span")

redisdb.hset("stats", "token", len(doc))  # idiomatic len() instead of doc.__len__()

# NER label inventory used to filter --full-labels ranking.
labels = [
    "EVENT",
    "PERCENT",
    "MONEY",
    "FAC",
    "TIME",
    "QUANTITY",
    "WORK_OF_ART",
    "LANGUAGE",
    "PRODUCT",
    "LOC",
    "LAW",
    "DATE",
    "ORDINAL",
    "NORP",
    "ORG",
    "CARDINAL",
    "GPE",
    "PERSON",
]

# Rank entity labels overall and, with --full-labels, the values per label.
for entity in doc.ents:
    redisdb.zincrby("labels", 1, entity.label_)
    if not args.full_labels:
        continue
    if entity.label_ in labels:
        redisdb.zincrby("label:{}".format(entity.label_), 1, entity.text)
# Categorise every token into Redis ranked sets plus per-category counters in
# the "stats" hash.  Out-of-vocabulary tokens are further split into hashtags,
# mentions, digits, spaces, URLs, e-mail addresses, and plain "oov".
for token in doc:
    # Record the sentence containing the token requested via --token-span.
    if args.token_span is not None and not disable:
        if token.text == args.token_span:
            redisdb.zincrby("span", 1, token.sent.as_doc().text)
    if token.pos_ == "VERB" and not token.is_oov and len(token) > 1:
        if not args.verbatim:
            redisdb.zincrby("verb", 1, token.lemma_)
        else:
            redisdb.zincrby("verb", 1, token.text)
        redisdb.hincrby("stats", "verb", 1)
        continue
    if token.pos_ == "NOUN" and not token.is_oov and len(token) > 1:
        if not args.verbatim:
            redisdb.zincrby("noun", 1, token.lemma_)
        else:
            redisdb.zincrby("noun", 1, token.text)
        redisdb.hincrby("stats", "noun", 1)
        continue
    if token.pos_ == "PUNCT" and not token.is_oov:
        redisdb.zincrby("punct", 1, token.text)  # idiomatic token.text over "{}".format(token)
        redisdb.hincrby("stats", "punct", 1)
        continue
    if token.is_oov:
        value = token.text
        if value.startswith('#'):
            redisdb.zincrby("hashtag", 1, value[1:])
            redisdb.hincrby("stats", "hashtag", 1)
            continue
        if value.startswith('@'):
            redisdb.zincrby("mention", 1, value[1:])
            redisdb.hincrby("stats", "mention", 1)
            continue
        if token.is_digit:
            redisdb.zincrby("digit", 1, value)
            redisdb.hincrby("stats", "digit", 1)
            continue
        if token.is_space:
            redisdb.hincrby("stats", "space", 1)
            continue
        if token.like_url:
            redisdb.zincrby("url", 1, value)
            redisdb.hincrby("stats", "url", 1)
            continue
        if token.like_email:
            redisdb.zincrby("email", 1, value)
            redisdb.hincrby("stats", "email", 1)
            continue
        redisdb.zincrby("oov", 1, value)
        redisdb.hincrby("stats", "oov", 1)
# Emit the ranked results.  Supported formats: csv (default), readable
# (tabulate tables), and json (accumulated here, printed at the end of the
# script).
if args.o == "json":
    output_json = {"format": "napkin", "version": version}

for anal in analysis:
    more_info = ""
    # Honour --analysis filtering (original pass/else-continue, inverted).
    if args.analysis != "all" and args.analysis != anal:
        continue
    if anal == "span":
        more_info = "for {}".format(args.token_span)
    if args.o == "readable":
        previous_value = None
    # Top entries, highest score first; -t -1 means "no limit" for zrevrange.
    x = redisdb.zrevrange(anal, 0, args.t, withscores=True, score_cast_func=int)
    if args.o == "csv":
        print()
    elif args.o == "readable":
        header = ["\033[1mTop {} of {} {}\033[0m".format(args.t, anal, more_info)]
        readable_table = []
    elif args.o == "json":
        output_json.update({anal: []})
    for a in x:
        if args.o == "csv":
            print("{},{},{}".format(anal, a[0], a[1]))
        elif args.o == "readable":
            # Group rows under an "N occurrences" banner; a repeated score
            # only appends the value row.
            if previous_value == a[1]:
                readable_table.append(["{}".format(a[0])])
            elif previous_value is None or a[1] < previous_value:
                previous_value = a[1]
                readable_table.append(["{} occurrences".format(a[1])])  # fix: typo "occurences"
                readable_table.append(["{}".format(a[0])])
        elif args.o == "json":
            output_json[anal].append(a)
    if args.o == "readable":
        print(tabulate(readable_table, header, tablefmt=args.table_format))
    if args.o == "csv":
        print("#")
# With -s, dump the raw per-category counters gathered during the token loop.
if args.s:
    print(redisdb.hgetall('stats'))

# JSON output is printed once, after every analysis has been accumulated.
if args.o == "json":
    print(json.dumps(output_json))