From fb289cec1bb613daf1a7100533871bfab3e23af3 Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy <a@foo.be>
Date: Fri, 9 Oct 2020 21:17:03 +0200
Subject: [PATCH] chg: [analysis] get rid of single-char tokens in the analysis

TODO: What about Chinese and similar languages, where a single character can be a whole word? Needs to be tested.
---
 bin/napkin.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/bin/napkin.py b/bin/napkin.py
index 7a2e6b9..164d564 100644
--- a/bin/napkin.py
+++ b/bin/napkin.py
@@ -64,14 +64,14 @@ analysis = ["verb", "noun", "hashtag", "mention",
 redisdb.hset("stats", "token", doc.__len__())
 
 for token in doc:
-        if token.pos_ == "VERB" and not token.is_oov:
+        if token.pos_ == "VERB" and not token.is_oov and len(token) > 1:
             if not args.verbatim:
                 redisdb.zincrby("verb", 1, token.lemma_)
             else:
                 redisdb.zincrby("verb", 1, token.text)
             redisdb.hincrby("stats", "verb", 1)
             continue
-        if token.pos_ == "NOUN" and not token.is_oov:
+        if token.pos_ == "NOUN" and not token.is_oov and len(token) > 1:
             if not args.verbatim:
                 redisdb.zincrby("noun", 1, token.lemma_)
             else:
@@ -83,7 +83,6 @@ for token in doc:
             redisdb.hincrby("stats", "punct", 1)
             continue
 
-
         if token.is_oov:
             value = "{}".format(token)
             if value.startswith('#'):