mirror of
https://github.com/adulau/napkin-text-analysis.git
synced 2024-11-24 10:57:07 +00:00
new: [input] you can now read from stdin directly with -i
This commit is contained in:
parent
b1ddcfa53c
commit
a6d5a5bbe4
1 changed file with 11 additions and 3 deletions
|
@ -8,6 +8,7 @@ import sys
|
|||
import simplejson as json
|
||||
from tabulate import tabulate
|
||||
import cld3
|
||||
import fileinput
|
||||
|
||||
version = "0.9"
|
||||
|
||||
|
@ -18,6 +19,7 @@ parser.add_argument('-t', help="maximum value for the top list (default is 100)
|
|||
parser.add_argument('-s', help="display the overall statistics (default is False)", default=False, action='store_true')
|
||||
parser.add_argument('-o', help="output format (default is csv), json, readable", default="csv")
|
||||
parser.add_argument('-l', help="language used for the analysis (default is en)", default="en")
|
||||
parser.add_argument('-i', help="Use stdin instead of a filename", default=False, action='store_true')
|
||||
parser.add_argument('--verbatim', help="Don't use the lemmatized form, use verbatim. (default is the lematized form)", default=False, action='store_true')
|
||||
parser.add_argument('--no-flushdb', help="Don't flush the redisdb, useful when you want to process multiple files and aggregate the results. (by default the redis database is flushed at each run)", default=False, action='store_true')
|
||||
parser.add_argument('--binary', help="set output in binary instead of UTF-8 (default)", default=False, action='store_true')
|
||||
|
@ -31,7 +33,7 @@ parser.add_argument('--full-labels', help="store each label value in a ranked se
|
|||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.f is None:
|
||||
if args.f is None and not args.i:
|
||||
parser.print_help()
|
||||
sys.exit()
|
||||
|
||||
|
@ -77,9 +79,15 @@ else:
|
|||
|
||||
nlp.max_length = 2000000
|
||||
|
||||
with open(args.f, 'r') as file:
|
||||
if args.f:
|
||||
with open(args.f, 'r') as file:
|
||||
text = file.read()
|
||||
|
||||
if args.i:
|
||||
text = ""
|
||||
for line in sys.stdin:
|
||||
text = text + line
|
||||
|
||||
detect_lang = cld3.get_language(text)
|
||||
if detect_lang[0] != args.l:
|
||||
sys.exit("Language detected ({}) is different than the NLP used ({})".format(detect_lang[0], args.l))
|
||||
|
|
Loading…
Reference in a new issue