mirror of
https://github.com/adulau/napkin-text-analysis.git
synced 2024-12-22 08:36:00 +00:00
chg: [analysis] get rid of single-character tokens in the analysis
TODO: What about Chinese and similar languages? This needs to be tested.
This commit is contained in:
parent
02938bd464
commit
fb289cec1b
1 changed file with 2 additions and 3 deletions
|
@ -64,14 +64,14 @@ analysis = ["verb", "noun", "hashtag", "mention",
|
|||
redisdb.hset("stats", "token", doc.__len__())
|
||||
|
||||
for token in doc:
|
||||
if token.pos_ == "VERB" and not token.is_oov:
|
||||
if token.pos_ == "VERB" and not token.is_oov and len(token) > 1:
|
||||
if not args.verbatim:
|
||||
redisdb.zincrby("verb", 1, token.lemma_)
|
||||
else:
|
||||
redisdb.zincrby("verb", 1, token.text)
|
||||
redisdb.hincrby("stats", "verb", 1)
|
||||
continue
|
||||
if token.pos_ == "NOUN" and not token.is_oov:
|
||||
if token.pos_ == "NOUN" and not token.is_oov and len(token) > 1:
|
||||
if not args.verbatim:
|
||||
redisdb.zincrby("noun", 1, token.lemma_)
|
||||
else:
|
||||
|
@ -83,7 +83,6 @@ for token in doc:
|
|||
redisdb.hincrby("stats", "punct", 1)
|
||||
continue
|
||||
|
||||
|
||||
if token.is_oov:
|
||||
value = "{}".format(token)
|
||||
if value.startswith('#'):
|
||||
|
|
Loading…
Reference in a new issue