chg: [analysis] get rid of single-character tokens in the analysis

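TODO: What about Chinese and similar languages, where a single character can be a complete word? The `len(token) > 1` filter needs to be tested against such text.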
2024-11-22 01:47:06 +00:00 · 2020-10-09 21:17:03 +02:00 · 2020-10-09 21:17:03 +02:00 · fb289cec1b
commit fb289cec1b
parent 02938bd464
1 changed files with 2 additions and 3 deletions
--- a/bin/napkin.py
+++ b/bin/napkin.py
@@ -64,14 +64,14 @@ analysis = ["verb", "noun", "hashtag", "mention",
 redisdb.hset("stats", "token", doc.__len__())
 for token in doc:
-        if token.pos_ == "VERB" and not token.is_oov:
+        if token.pos_ == "VERB" and not token.is_oov and len(token) > 1:
            if not args.verbatim:
                redisdb.zincrby("verb", 1, token.lemma_)
            else:
                redisdb.zincrby("verb", 1, token.text)
            redisdb.hincrby("stats", "verb", 1)
            continue
-        if token.pos_ == "NOUN" and not token.is_oov:
+        if token.pos_ == "NOUN" and not token.is_oov and len(token) > 1:
            if not args.verbatim:
                redisdb.zincrby("noun", 1, token.lemma_)
            else:
@@ -83,7 +83,6 @@ for token in doc:
            redisdb.hincrby("stats", "punct", 1)
            continue
        if token.is_oov:
            value = "{}".format(token)
            if value.startswith('#'):