From fb289cec1bb613daf1a7100533871bfab3e23af3 Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Fri, 9 Oct 2020 21:17:03 +0200 Subject: [PATCH] chg: [analysis] get rid of single char token in the analysis TODO: What about Chinese and similar languages? Needs to be tested --- bin/napkin.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/napkin.py b/bin/napkin.py index 7a2e6b9..164d564 100644 --- a/bin/napkin.py +++ b/bin/napkin.py @@ -64,14 +64,14 @@ analysis = ["verb", "noun", "hashtag", "mention", redisdb.hset("stats", "token", doc.__len__()) for token in doc: - if token.pos_ == "VERB" and not token.is_oov: + if token.pos_ == "VERB" and not token.is_oov and len(token) > 1: if not args.verbatim: redisdb.zincrby("verb", 1, token.lemma_) else: redisdb.zincrby("verb", 1, token.text) redisdb.hincrby("stats", "verb", 1) continue - if token.pos_ == "NOUN" and not token.is_oov: + if token.pos_ == "NOUN" and not token.is_oov and len(token) > 1: if not args.verbatim: redisdb.zincrby("noun", 1, token.lemma_) else: @@ -83,7 +83,6 @@ for token in doc: redisdb.hincrby("stats", "punct", 1) continue - if token.is_oov: value = "{}".format(token) if value.startswith('#'):