chg: [analysis] get rid of single char token in the analysis

TODO: What about Chinese and the like? Needs to be tested (see the sketch below).
Alexandre Dulaunoy 2020-10-09 21:17:03 +02:00
parent 02938bd464
commit fb289cec1b
Signed by: adulau
GPG key ID: 09E2CD4944E6CBCD


@@ -64,14 +64,14 @@ analysis = ["verb", "noun", "hashtag", "mention",
 redisdb.hset("stats", "token", doc.__len__())
 for token in doc:
-    if token.pos_ == "VERB" and not token.is_oov:
+    if token.pos_ == "VERB" and not token.is_oov and len(token) > 1:
         if not args.verbatim:
             redisdb.zincrby("verb", 1, token.lemma_)
         else:
             redisdb.zincrby("verb", 1, token.text)
         redisdb.hincrby("stats", "verb", 1)
         continue
-    if token.pos_ == "NOUN" and not token.is_oov:
+    if token.pos_ == "NOUN" and not token.is_oov and len(token) > 1:
         if not args.verbatim:
             redisdb.zincrby("noun", 1, token.lemma_)
         else:
@@ -83,7 +83,6 @@ for token in doc:
         redisdb.hincrby("stats", "punct", 1)
         continue
     if token.is_oov:
-        value = "{}".format(token)
         if value.startswith('#'):
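
For readers outside the diff context, here is a simplified, self-contained sketch of the loop being modified, with collections.Counter standing in for Redis and en_core_web_md as an assumed model with word vectors (so token.is_oov is meaningful); it is an illustration, not the project's actual code:

from collections import Counter

import spacy

# Counter stands in for the Redis sorted sets used by the real script.
counts = {"verb": Counter(), "noun": Counter()}

nlp = spacy.load("en_core_web_md")
doc = nlp("A cat sat on x and watched the dogs.")
for token in doc:
    # Same guard as in the diff: skip out-of-vocabulary tokens and,
    # after this commit, any token that is a single character long.
    if token.pos_ == "VERB" and not token.is_oov and len(token) > 1:
        counts["verb"][token.lemma_] += 1
        continue
    if token.pos_ == "NOUN" and not token.is_oov and len(token) > 1:
        counts["noun"][token.lemma_] += 1

print(counts)  # single-character tokens such as "x" can no longer be counted as verbs or nouns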