mirror of
https://github.com/adulau/napkin-text-analysis.git
synced 2024-11-22 01:47:06 +00:00
chg: [analysis] get rid of single-character tokens in the analysis
TODO: What about Chinese and similar languages, where a single character can be a complete word? Needs to be tested.
This commit is contained in:
parent
02938bd464
commit
fb289cec1b
1 changed file with 2 additions and 3 deletions
|
@@ -64,14 +64,14 @@ analysis = ["verb", "noun", "hashtag", "mention",
|
||||||
redisdb.hset("stats", "token", doc.__len__())
|
redisdb.hset("stats", "token", doc.__len__())
|
||||||
|
|
||||||
for token in doc:
|
for token in doc:
|
||||||
if token.pos_ == "VERB" and not token.is_oov:
|
if token.pos_ == "VERB" and not token.is_oov and len(token) > 1:
|
||||||
if not args.verbatim:
|
if not args.verbatim:
|
||||||
redisdb.zincrby("verb", 1, token.lemma_)
|
redisdb.zincrby("verb", 1, token.lemma_)
|
||||||
else:
|
else:
|
||||||
redisdb.zincrby("verb", 1, token.text)
|
redisdb.zincrby("verb", 1, token.text)
|
||||||
redisdb.hincrby("stats", "verb", 1)
|
redisdb.hincrby("stats", "verb", 1)
|
||||||
continue
|
continue
|
||||||
if token.pos_ == "NOUN" and not token.is_oov:
|
if token.pos_ == "NOUN" and not token.is_oov and len(token) > 1:
|
||||||
if not args.verbatim:
|
if not args.verbatim:
|
||||||
redisdb.zincrby("noun", 1, token.lemma_)
|
redisdb.zincrby("noun", 1, token.lemma_)
|
||||||
else:
|
else:
|
||||||
|
@@ -83,7 +83,6 @@ for token in doc:
|
||||||
redisdb.hincrby("stats", "punct", 1)
|
redisdb.hincrby("stats", "punct", 1)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
||||||
if token.is_oov:
|
if token.is_oov:
|
||||||
value = "{}".format(token)
|
value = "{}".format(token)
|
||||||
if value.startswith('#'):
|
if value.startswith('#'):
|
||||||
|
|
Loading…
Reference in a new issue