mirror of
https://github.com/adulau/napkin-text-analysis.git
synced 2024-11-24 10:57:07 +00:00
new: [input] you can now read from stdin directly with -i
This commit is contained in:
parent
b1ddcfa53c
commit
a6d5a5bbe4
1 changed file with 11 additions and 3 deletions
|
@ -8,6 +8,7 @@ import sys
|
||||||
import simplejson as json
|
import simplejson as json
|
||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
import cld3
|
import cld3
|
||||||
|
import fileinput
|
||||||
|
|
||||||
version = "0.9"
|
version = "0.9"
|
||||||
|
|
||||||
|
@ -18,6 +19,7 @@ parser.add_argument('-t', help="maximum value for the top list (default is 100)
|
||||||
parser.add_argument('-s', help="display the overall statistics (default is False)", default=False, action='store_true')
|
parser.add_argument('-s', help="display the overall statistics (default is False)", default=False, action='store_true')
|
||||||
parser.add_argument('-o', help="output format (default is csv), json, readable", default="csv")
|
parser.add_argument('-o', help="output format (default is csv), json, readable", default="csv")
|
||||||
parser.add_argument('-l', help="language used for the analysis (default is en)", default="en")
|
parser.add_argument('-l', help="language used for the analysis (default is en)", default="en")
|
||||||
|
parser.add_argument('-i', help="Use stdin instead of a filename", default=False, action='store_true')
|
||||||
parser.add_argument('--verbatim', help="Don't use the lemmatized form, use verbatim. (default is the lematized form)", default=False, action='store_true')
|
parser.add_argument('--verbatim', help="Don't use the lemmatized form, use verbatim. (default is the lematized form)", default=False, action='store_true')
|
||||||
parser.add_argument('--no-flushdb', help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. (by default the redis database is flushed at each run)", default=False, action='store_true')
|
parser.add_argument('--no-flushdb', help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. (by default the redis database is flushed at each run)", default=False, action='store_true')
|
||||||
parser.add_argument('--binary', help="set output in binary instead of UTF-8 (default)", default=False, action='store_true')
|
parser.add_argument('--binary', help="set output in binary instead of UTF-8 (default)", default=False, action='store_true')
|
||||||
|
@ -31,7 +33,7 @@ parser.add_argument('--full-labels', help="store each label value in a ranked se
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.f is None:
|
if args.f is None and not args.i:
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
|
@ -77,9 +79,15 @@ else:
|
||||||
|
|
||||||
nlp.max_length = 2000000
|
nlp.max_length = 2000000
|
||||||
|
|
||||||
with open(args.f, 'r') as file:
|
if args.f:
|
||||||
|
with open(args.f, 'r') as file:
|
||||||
text = file.read()
|
text = file.read()
|
||||||
|
|
||||||
|
if args.i:
|
||||||
|
text = ""
|
||||||
|
for line in sys.stdin:
|
||||||
|
text = text + line
|
||||||
|
|
||||||
detect_lang = cld3.get_language(text)
|
detect_lang = cld3.get_language(text)
|
||||||
if detect_lang[0] != args.l:
|
if detect_lang[0] != args.l:
|
||||||
sys.exit("Language detected ({}) is different than the NLP used ({})".format(detect_lang[0], args.l))
|
sys.exit("Language detected ({}) is different than the NLP used ({})".format(detect_lang[0], args.l))
|
||||||
|
|
Loading…
Reference in a new issue